# In [None]:
# =====================[ WIKIPEDIA CITATION ANALYZER - FINAL WORKING VERSION ]=====================
import requests
import re
import csv
import time
from datetime import datetime
from urllib.parse import quote
from google.colab import files
from google.colab import userdata
from typing import List, Dict

# =====================[ ASCII ART HEADER ]=====================
def display_header():
    """Print the boxed start-up banner for the tool to standard output."""
    # Raw string so the box-drawing text is emitted exactly as written.
    banner = r"""
+-----------------------------------------------+
|  🔍 Wikipedia Citation Analyzer (Bot Edition) |
+-----------------------------------------------+
|  • Bot-authenticated API access              |
|  • Scans for incomplete Cite web templates   |
|  • Processes multiple articles               |
|  • Created by Addobentil@CitationAnalyzerBot |
+-----------------------------------------------+
"""
    print(banner)

# =====================[ CONFIGURATION ]=====================
class Config:
    """Static settings read by the analyzer class and the menu functions.

    The two credential attributes are looked up through ``userdata.get``
    while this class body is being executed, i.e. when the module is loaded.
    """

    # Endpoint that every API call in this file is sent to.
    API_URL = "https://en.wikipedia.org/w/api.php"
    # Value installed as the session's User-Agent header.
    USER_AGENT = "CitationAnalyzerBot/1.0"
    # Bot credentials, fetched by key via google.colab's ``userdata.get``.
    BOT_USERNAME = userdata.get('WIKI_BOT_USERNAME')
    BOT_PASSWORD = userdata.get('WIKI_BOT_PASSWORD')
    # Passed to time.sleep() before each API request attempt (seconds).
    REQUEST_DELAY = 0.3
    # Upper bound applied to the user-supplied scan size.
    MAX_ARTICLES = 2000
    # Number of titles handled per API round trip.
    BATCH_SIZE = 50

# =====================[ BOT ANALYZER CLASS ]=====================
class WikipediaBotAnalyzer:
    """Logged-in MediaWiki API client that finds incomplete ``Cite web`` templates.

    Creating an instance opens a ``requests`` session and logs in with the
    credentials from ``Config``. Scan results are accumulated in
    ``self.results`` (a list of dicts with ``title``, ``total``,
    ``incomplete`` and ``problems`` keys) and can be exported with
    :meth:`save_to_csv`.
    """

    # Seconds to wait for a single HTTP response before treating it as failed.
    REQUEST_TIMEOUT = 30

    # Parameters that every Cite web template is expected to carry.
    REQUIRED_FIELDS = ("title", "url")

    # One Cite web template. The body may contain plain text, one level of
    # nested ``{{...}}`` templates, or single-brace ``{...}`` groups.
    # Alternatives are matched one character/group at a time so a failed
    # match cannot backtrack exponentially.
    _CITE_WEB_RE = re.compile(
        r"\{\{\s*Cite\s+web\s*\|((?:[^{}]|\{\{[^{}]*\}\}|\{[^{}]*\})*)\}\}",
        re.IGNORECASE,
    )

    # A parameter name: text between a pipe (or the start of the template
    # body) and the following "=". Anchoring on the pipe keeps "archive-url"
    # from being mistaken for "url".
    _PARAM_NAME_RE = re.compile(r"(?:^|\|)\s*([^=|{}\[\]]+?)\s*=")

    def __init__(self):
        """Create the HTTP session, initialise result storage and log in."""
        self.csv_filename = ""
        self.results = []
        self.session = requests.Session()
        self.session.headers.update({'User-Agent': Config.USER_AGENT})
        self._login()

    def _login(self):
        """Authenticate bot account.

        Fetches a login token, then posts the credentials from ``Config``.

        Raises:
            RuntimeError: if the API does not answer with result ``Success``.
            Exception: any network/parsing error is printed and re-raised.
        """
        print("\n🔐 Authenticating bot...")
        try:
            token_params = {
                'action': 'query',
                'meta': 'tokens',
                'type': 'login',
                'format': 'json'
            }
            token_response = self.session.get(
                Config.API_URL, params=token_params, timeout=self.REQUEST_TIMEOUT
            ).json()
            login_token = token_response['query']['tokens']['logintoken']

            auth_params = {
                'action': 'login',
                'lgname': Config.BOT_USERNAME,
                'lgpassword': Config.BOT_PASSWORD,
                'lgtoken': login_token,
                'format': 'json'
            }

            login_response = self.session.post(
                Config.API_URL, data=auth_params, timeout=self.REQUEST_TIMEOUT
            ).json()
            if login_response.get('login', {}).get('result') != 'Success':
                raise RuntimeError("❌ Bot login failed. Check credentials.")
            print("✅ Authentication successful\n")
        except Exception as e:
            print(f"❌ Login error: {str(e)}")
            raise

    def api_request(self, params: dict, max_retries=3):
        """Make authenticated API requests.

        Args:
            params: API parameters. The dict is not modified; ``format=json``
                is added to a copy.
            max_retries: maximum number of attempts.

        Returns:
            The decoded JSON response.

        Raises:
            requests.RequestException / ValueError: the error from the last
                attempt when every attempt failed.
            RuntimeError: when the attempts were used up by HTTP 429
                responses (previously this case returned ``None``).
        """
        payload = {**params, 'format': 'json'}

        for attempt in range(max_retries):
            try:
                time.sleep(Config.REQUEST_DELAY)
                response = self.session.post(
                    Config.API_URL, data=payload, timeout=self.REQUEST_TIMEOUT
                )

                if response.status_code == 429:
                    # Retry-After may also be an HTTP date; fall back to a
                    # fixed wait instead of treating that as a failure.
                    try:
                        retry_after = int(response.headers.get('Retry-After', 5))
                    except ValueError:
                        retry_after = 5
                    print(f"⏳ Rate limited. Waiting {retry_after}s...")
                    time.sleep(retry_after)
                    continue

                response.raise_for_status()
                return response.json()

            except (requests.RequestException, ValueError) as e:
                print(f"⚠️ Attempt {attempt+1} failed: {str(e)}")
                if attempt == max_retries - 1:
                    raise
                # Linear back-off: 2s, 4s, ...
                time.sleep((attempt + 1) * 2)

        raise RuntimeError(
            f"API request abandoned after {max_retries} attempts (rate limited)"
        )

    def get_wikitext_batch(self, titles: List[str]) -> Dict[str, str]:
        """Fetch multiple articles at once.

        Args:
            titles: page titles to fetch in one query.

        Returns:
            Mapping of the title reported by the API to the main-slot
            wikitext. Pages that come back without revision content
            (missing or invalid titles) are left out instead of failing
            the whole batch.
        """
        if not titles:
            return {}

        params = {
            'action': 'query',
            'prop': 'revisions',
            'rvprop': 'content',
            'rvslots': 'main',
            'titles': '|'.join(titles),
            'format': 'json',
            'formatversion': '2'
        }

        results = {}
        while True:
            data = self.api_request(params)
            for page in data.get('query', {}).get('pages', []):
                revisions = page.get('revisions')
                if 'missing' in page or not revisions:
                    continue
                content = revisions[0].get('slots', {}).get('main', {}).get('content')
                if content is not None:
                    results[page['title']] = content

            # Large batches can be split over several responses; keep asking
            # until the API stops returning a continuation token.
            if 'continue' not in data:
                break
            params.update(data['continue'])
        return results

    def analyze_citations(self, wikitext: str) -> Dict:
        """Analyze citations in wikitext.

        Args:
            wikitext: raw page wikitext (may be empty or ``None``).

        Returns:
            Dict with ``total`` (number of Cite web templates found),
            ``incomplete`` (one comma-joined string of missing field names
            per incomplete citation) and ``problems`` (dicts holding the
            ``missing`` string and the reconstructed template ``text``).
            A field counts as present when its parameter name appears, even
            if the value is empty.
        """
        results = {'total': 0, 'incomplete': [], 'problems': []}
        if not wikitext:
            return results

        # Pipe magic words are expanded so they act as parameter separators.
        wikitext = wikitext.replace("{{!}}", "|").replace("{{pipe}}", "|")
        citations = self._CITE_WEB_RE.findall(wikitext)
        results['total'] = len(citations)

        for citation in citations:
            present = {
                match.group(1).strip().lower()
                for match in self._PARAM_NAME_RE.finditer(citation)
            }
            missing = [name for name in self.REQUIRED_FIELDS if name not in present]
            if not missing:
                continue

            missing_text = ", ".join(missing)
            results['incomplete'].append(missing_text)
            results['problems'].append({
                'missing': missing_text,
                'text': f"{{{{Cite web|{citation}}}}}"
            })

        return results

    def save_to_csv(self):
        """Save results with specified headers.

        Writes one row per entry of ``self.results`` to a timestamped CSV
        file in the current working directory.

        Returns:
            The file name written, or ``""`` when there are no results.
        """
        if not self.results:
            return ""

        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        self.csv_filename = f"citation_analysis_{timestamp}.csv"

        with open(self.csv_filename, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=[
                'article title',
                'number of total citations',
                'number of incomplete citations',
                'missing fields',
                'problematic citations'
            ])
            writer.writeheader()

            for item in self.results:
                problematic_citations = "\n\n".join(
                    f"Missing: {prob['missing']}\n{prob['text']}"
                    for prob in item['problems']
                ) if item['problems'] else "None"

                writer.writerow({
                    'article title': item['title'],
                    'number of total citations': item['total'],
                    'number of incomplete citations': len(item['incomplete']),
                    # Sorted so the column is stable between runs.
                    'missing fields': "; ".join(sorted(set(item['incomplete']))),
                    'problematic citations': problematic_citations
                })

        return self.csv_filename

# =====================[ USER INTERFACE ]=====================
def get_article_list():
    """Prompt repeatedly for article titles and return them as a list.

    Each entry is stripped of surrounding whitespace; the first entry that
    is empty after stripping ends the prompt loop and is not included.
    """
    print("\nEnter article titles (one per line, blank line to finish):")

    def next_title():
        return input("> ").strip()

    # Two-argument iter() keeps calling next_title until it yields "".
    return list(iter(next_title, ""))

def analyze_specific_articles(analyzer):
    """Interactively analyze user-entered articles and print the findings.

    Titles are collected with get_article_list(), fetched through
    ``analyzer`` in groups of Config.BATCH_SIZE, and each article's citation
    report is written to the screen. Nothing is stored or exported. A
    failure inside one group is reported and the next group is attempted.
    """
    requested = get_article_list()
    if not requested:
        print("\nNo articles entered. Returning to menu.")
        return

    requested_count = len(requested)
    print(f"\n🔍 Starting analysis of {requested_count} articles...")

    processed = 0
    step = Config.BATCH_SIZE

    for start in range(0, requested_count, step):
        stop = start + step
        chunk = requested[start:stop]
        print(f"\n=== Processing Articles {start+1}-{min(stop, requested_count)} ===")

        try:
            fetched = analyzer.get_wikitext_batch(chunk)
            for title, text in fetched.items():
                print(f"\n📄 Article: {title}")
                report = analyzer.analyze_citations(text)

                print(f"\n📊 Total citations: {report['total']}")
                print(f"⚠️ Incomplete citations: {len(report['incomplete'])}")

                problems = report['problems']
                if not problems:
                    print("✅ All citations are complete!")
                else:
                    print("\nProblematic citations:")
                    for number, problem in enumerate(problems, 1):
                        print(f"\n{number}. Missing: {problem['missing']}")
                        print(problem['text'])

                print("\n" + "-"*50)  # Separator between articles

            # Counted only once the whole group was printed without error.
            processed += len(fetched)

        except Exception as e:
            print(f"⚠️ Error processing batch: {str(e)}")

    print(f"\n✅ Analysis complete! Processed {processed} articles")

def _scan_cite_web_articles(analyzer, limit):
    """Scan up to ``limit`` articles that transclude Template:Cite_web.

    Results from any earlier scan are discarded first. Pages with at least
    one incomplete citation are appended to ``analyzer.results``.

    Args:
        analyzer: object providing ``api_request``, ``get_wikitext_batch``,
            ``analyze_citations`` and a ``results`` list.
        limit: maximum number of pages to examine (must be >= 1).

    Returns:
        The number of pages actually examined.
    """
    # Start clean so a second scan does not re-export rows from the first.
    analyzer.results = []

    params = {
        'action': 'query',
        'list': 'embeddedin',
        'eititle': 'Template:Cite_web',
        # Namespace 0 = articles; without it talk/user/draft pages are included.
        'einamespace': 0,
        'format': 'json'
    }
    total_batches = (limit - 1) // Config.BATCH_SIZE + 1
    scanned = 0
    batch_number = 0

    while scanned < limit:
        remaining = limit - scanned
        # Never request more pages than are still needed to reach the limit.
        params['eilimit'] = min(Config.BATCH_SIZE, remaining)
        batch_number += 1
        print(f"\n=== Processing Articles {scanned + 1}-{scanned + params['eilimit']} ===")

        data = analyzer.api_request(params)
        new_articles = data.get('query', {}).get('embeddedin', [])[:remaining]
        scanned += len(new_articles)

        if new_articles:
            titles = [a['title'] for a in new_articles]
            print(f"  Fetching wikitext for {len(titles)} articles...")
            print(f"\nAnalyzing batch {batch_number}/{total_batches}")
            wikitexts = analyzer.get_wikitext_batch(titles)

            for title, text in wikitexts.items():
                print(f"  Checking: {title[:50]}...")
                analysis = analyzer.analyze_citations(text)

                if analysis['incomplete']:
                    analyzer.results.append({
                        'title': title,
                        **analysis
                    })

        if 'continue' not in data:
            break
        params.update(data['continue'])

    return scanned


def main():
    """Run the interactive menu until the user chooses to exit.

    Option 1 scans pages using the Cite web template and exports a CSV,
    option 2 analyzes user-entered articles on screen, option 3 quits.
    Errors raised during a scan are reported and the menu is shown again.
    """
    display_header()
    analyzer = WikipediaBotAnalyzer()

    while True:
        print("\n" + "="*50)
        print("1. Scan random articles using Cite web template (exports CSV)")
        print("2. Analyze specific article(s) (screen output only)")
        print("3. Exit")

        choice = input("\nChoose option (1-3): ").strip()

        if choice == "1":
            raw_limit = input(
                f"\nHow many articles to scan? (Max {Config.MAX_ARTICLES}): "
            ).strip()
            try:
                # Empty input falls back to the default of 100.
                limit = int(raw_limit or "100")
            except ValueError:
                print("\n❌ Please enter a whole number of articles.")
                continue
            if limit < 1:
                print("\n❌ The number of articles must be at least 1.")
                continue
            limit = min(limit, Config.MAX_ARTICLES)

            try:
                print(f"\n🔍 Starting analysis of {limit} articles...")
                _scan_cite_web_articles(analyzer, limit)

                total_incomplete = sum(len(item['incomplete']) for item in analyzer.results)
                print(f"\n✅ Scan complete! Found {len(analyzer.results)} articles with {total_incomplete} incomplete citations")

                # Save and show results
                if analyzer.results:
                    print("\n💾 Saving results...")
                    filename = analyzer.save_to_csv()
                    print(f"\n📏 Results saved to: {filename}")
                    print("\nTo download:")
                    print("1. Click the Colab folder icon📁 on the left-sidebar")
                    print("2. Right click on the file and select download.")

            except Exception as e:
                # Menu boundary: report the failure and keep the session alive.
                print(f"\n❌ Error during scan: {str(e)}")

        elif choice == "2":
            analyze_specific_articles(analyzer)

        elif choice == "3":
            print("\nThank you for using Wikipedia Citation Analyzer!")
            break

        else:
            print("Invalid choice. Please enter 1, 2, or 3.")

# Start the interactive menu only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()