In [None]:
import wikipediaapi
import random
import re
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec
import gensim

from flask import Flask, request

In [29]:
def crawl_wikipedia(categories=None, max_pages=100):
    """Collect the plain text of Wikipedia articles listed in a set of categories.

    Categories are visited in the given order. Only direct members that live
    in the main (article) namespace are kept; subcategories are not followed.
    Pages whose extracted text is empty are skipped so they do not use up the
    page budget. Crawling stops as soon as ``max_pages`` texts are collected.

    Args:
        categories: Iterable of category titles such as ``"Category:History"``.
            Defaults to History, Science, Art and Technology.
        max_pages: Maximum number of page texts to return. Defaults to 100.

    Returns:
        list[str]: Page texts in the order they were encountered.
    """
    if categories is None:
        categories = ["Category:History", "Category:Science", "Category:Art", "Category:Technology"]

    all_pages = []
    if max_pages <= 0:
        return all_pages

    # A custom user agent identifies the crawler to the Wikipedia servers
    wiki_wiki = wikipediaapi.Wikipedia(
        language='en',
        extract_format=wikipediaapi.ExtractFormat.WIKI,
        user_agent='MyCustomWikipediaCrawler/1.0 (https://mywebsite.com)'
    )

    for category in categories:
        # categorymembers maps member titles to page objects
        members = wiki_wiki.page(category).categorymembers.values()

        for page in members:
            # Keep articles only (skip subcategories, files, templates, ...)
            if page.ns != wikipediaapi.Namespace.MAIN:
                continue

            text = page.text
            if not text:
                continue

            all_pages.append(text)
            # Returning here ends both loops at once when the budget is reached
            if len(all_pages) >= max_pages:
                return all_pages

    return all_pages

# Run the crawl when this cell executes (this contacts Wikipedia over the network)
# and keep the raw page texts in `data` for the preprocessing cells that follow
data = crawl_wikipedia()

In [None]:
def preprocess_text(text):
    """Reduce raw page text to a space-separated string of lowercase alphabetic tokens."""
    # Lowercase, then collapse every run of non-word characters into a single space
    normalized = re.sub(r'\W+', ' ', text.lower())
    # Tokenize and drop anything that is not purely alphabetic (digits, tokens containing underscores)
    alphabetic_tokens = (token for token in word_tokenize(normalized) if token.isalpha())
    return ' '.join(alphabetic_tokens)

# Run the cleaning function over every crawled page
preprocessed_data = list(map(preprocess_text, data))

In [None]:
# Tokenize each preprocessed page into a list of words. Word2Vec treats every
# inner list as one training text, and gensim's optimized training code
# truncates any single text to 10,000 words. Long pages are therefore split
# into chunks of at most MAX_SENTENCE_TOKENS tokens so that the whole page
# contributes to training. Pages with no tokens produce no entry.
MAX_SENTENCE_TOKENS = 10000

tokenized_data = [
    page_tokens[start:start + MAX_SENTENCE_TOKENS]
    for page_tokens in map(word_tokenize, preprocessed_data)
    for start in range(0, len(page_tokens), MAX_SENTENCE_TOKENS)
]


### Part 3

In [None]:
# Fit Word2Vec embeddings on the tokenized corpus
model = Word2Vec(
    sentences=tokenized_data,
    vector_size=500,  # dimensionality of each word vector
    min_count=1,      # keep every word, even those that occur only once
    workers=4,        # number of training threads
)


In [None]:
# Persist the trained model to disk so it can be reloaded later without retraining
model.save("word2vec_wikipedia.model")

In [None]:
# Preview only: displaying the whole corpus would flood the notebook output.
# Shows the first 20 tokens of the first 3 token lists.
[tokens[:20] for tokens in tokenized_data[:3]]

In [None]:
 
# Rebuild the vocabulary from the corpus, run training again, and overwrite the saved model file.
# NOTE(review): `model` was already given `tokenized_data` when it was constructed, so this cell
# appears to repeat that work on the same data. build_vocab(update=False) presumably starts from
# a fresh vocabulary rather than extending the existing one; confirm against the gensim docs.
model.build_vocab(tokenized_data, update=False)
model.train(tokenized_data, total_examples=model.corpus_count, epochs=model.epochs)
model.save('word2vec_wikipedia.model')

In [None]:
# Reload the saved model from disk (Word2Vec is the same class as gensim.models.Word2Vec)
model = Word2Vec.load("word2vec_wikipedia.model")

In [None]:
# Analogy query: shift the target word by the offset that leads from the first
# reference word to the second, then list the words nearest to the result
reference_pair = ("science", "history")
target_word = "ancient"

source_word, destination_word = reference_pair
keyed_vectors = model.wv
result_vector = keyed_vectors[target_word] - keyed_vectors[source_word] + keyed_vectors[destination_word]

opposite_words = keyed_vectors.similar_by_vector(result_vector)
print(opposite_words)

In [None]:
# Flask application object that serves the single-page form defined below
app = Flask(__name__)

# Page template: a one-field POST form plus a result paragraph. The literal
# text "put_data_here" is a placeholder that home() replaces with the result.
html_form_with_message = '''
<!DOCTYPE html>
<html>
<head>
<title>Text Echo App</title>
</head>
<body>
    <h2>Enter Text</h2>
    <form method="post" action="/">
        <label for="text">Text:</label><br>
        <input type="text" name="my_input_value"><br><br>
        <input type="submit" value="My Button">
    </form>
    <p>The opposite is: put_data_here</p>
</body>
</html>
'''
def my_input(value):
    """Return the word the model places nearest to ``value - science + history``.

    The query is stripped and lowercased before lookup because the training
    text was lowercased. The query word and the two reference words are
    skipped in the result, since the nearest neighbours of an offset vector
    are usually those words themselves.

    Args:
        value: The word entered by the user.

    Returns:
        str: The closest vocabulary word that is not one of the query words.

    Raises:
        KeyError: If the normalised word is not in the model vocabulary.
    """
    reference_pair = ("science", "history")
    word = value.strip().lower()
    result_vector = model.wv[word] - model.wv[reference_pair[0]] + model.wv[reference_pair[1]]

    # Ask for one more candidate than there are words to skip, so at least one
    # usable result remains whenever the vocabulary contains any other word
    excluded = {word, *reference_pair}
    candidates = model.wv.similar_by_vector(result_vector, topn=len(excluded) + 1)
    for candidate, _similarity in candidates:
        if candidate not in excluded:
            return candidate

    # The vocabulary holds nothing but the query words: fall back to the top hit
    return candidates[0][0]

@app.route('/', methods=['GET', 'POST'])
def home():
    """Serve the form and, on POST, show the model's answer for the submitted word.

    A GET request, or a POST with an empty or missing field, renders the form
    with an empty result. If the word lookup raises ``KeyError`` (the word is
    not in the model vocabulary), a short message is shown instead of letting
    the request fail with a server error. Everything substituted into the
    page is HTML-escaped because it may contain the user's own input.

    Returns:
        str: The rendered HTML page.
    """
    from html import escape  # local import keeps this cell self-contained

    opposite_input = ''
    if request.method == 'POST':
        user_input = request.form.get('my_input_value', '').strip()
        if user_input:
            try:
                opposite_input = my_input(user_input)
            except KeyError:
                opposite_input = f'unavailable ("{user_input}" is not in the model vocabulary)'
    return html_form_with_message.replace("put_data_here", escape(opposite_input))

# Start Flask's built-in development server; this call blocks the cell until the server is stopped
app.run()

1. **Library Imports and Initial Setup:**
   - Importing essential libraries. `wikipediaapi` is used for accessing and retrieving data from Wikipedia, while `nltk` (Natural Language Toolkit) and `gensim` are crucial for natural language processing and machine learning tasks respectively.
   - The script also includes the downloading of 'punkt', a pre-trained tokenizer model from the NLTK library, essential for text tokenization tasks.

2. **Function to Crawl Wikipedia Pages:**
   - A custom function named `crawl_wikipedia` is defined to systematically scrape content from Wikipedia. 
   - The function sets up the Wikipedia API with specific parameters like the desired language (English) and a custom user agent for identification purposes.
   - The crawler targets four specific categories: History, Science, Art, and Technology. It iterates through the direct members of each category and keeps the text of those that are articles (main namespace); subcategories are not followed.
   - A limit is set to stop the crawling process after accumulating content from 100 pages, balancing the need for a sizable dataset with the practicality of processing time and resource allocation.

3. **Execution of Web Scraping:**
   - This part of the notebook is where the `crawl_wikipedia` function is called into action. The output is a collection of text data from the targeted Wikipedia pages.

4. **Text Preprocessing Function:**
   - Introduces a function `preprocess_text` to clean and prepare the scraped text for analysis. 
   - This preprocessing includes converting all text to lowercase (to ensure uniformity), removing special characters and punctuation (to focus on actual words), and tokenizing the text into individual words.
   - The function filters out tokens that are not purely alphabetic, thereby removing numbers and any remaining special characters.

5. **Tokenization of Preprocessed Data:**
   - Proceeds to tokenize the preprocessed text data into words. This step is crucial for word embedding models, which require word-level inputs.

6. **Training the Word2Vec Model:**
   - A Word2Vec model from the `gensim` library is trained on the tokenized text. 
   - The model parameters include a vector size of 500, which determines the dimensionality of the word vectors, and a minimum count of 1 for words to be included in the model training. 
   - The training utilizes 4 worker threads, indicating a multi-threaded approach to speed up the training process.

7. **Saving the Word2Vec Model:**
   - The trained Word2Vec model is saved to a file, allowing for its reuse without the need to retrain it.

8. **Displaying Tokenized Data:**
   - This part of the notebook is intended for displaying the tokenized data, possibly for verification or review purposes.

9. **Model Vocabulary Building and Retraining:**
   - The script rebuilds the model's vocabulary with the tokenized data and retrains the Word2Vec model.
   - The retrained model is then saved, presumably to update it with any new data or adjustments.

10. **Loading the Trained Model:**
    - The trained and saved Word2Vec model is loaded from the file for further use.

11. **Word Vector Arithmetic and Similarity Calculation:**
    - Demonstrates an application of the trained Word2Vec model. 
    - It performs vector arithmetic using words like "science," "history," and "ancient" to find semantically similar words in a shifted context.
    - The idea is to explore how the meaning of "ancient" changes when the context moves from "science" to "history".
    - The results of this vector arithmetic are printed, showcasing the model's capability to understand and manipulate word meanings based on their vector representations.
