# Step 0: Importing Required Packages

In [1]:
import pandas as pd
import gensim
from scipy import stats

# Step 1: Data Loading

In [9]:
news_df = pd.read_csv("/content/reddit_worldnews_start_to_2016-11-22.csv")
#file_2 = pd.read_csv("/content/reddit_worldnews_2016_2019.csv")
#file_3 = pd.read_csv("/content/reddit_worldnews_2019_2020.csv")

# Step 2: Data Preprocessing

In [13]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string

# Download NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')

#########################################################
def preprocess_df(df: pd.Series) -> pd.DataFrame:
    """
    Preprocess the input DataFrame.

    This function should perform text preprocessing task which may include:
      - Converting text to lowercase
      - Removing punctuations and whitespaces
      - Removing stopwords
      - Stemming or Lemmatizing
      - (Other preprocessing steps as you wish)

    Parameters:
        df (pd.DataFrame): A DataFrame containing at least a column

    Returns:
        pd.DataFrame: The DataFrame with the preprocessed column.
    """
    df = df.str.lower()
    df = df.str.translate(str.maketrans('', '', string.punctuation))
    df = df.str.replace('\s+', ' ')
    stop_words = set(stopwords.words('english'))
    df = df.apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))
    lemmatizer = WordNetLemmatizer()
    df = df.apply(lambda x: ' '.join([lemmatizer.lemmatize(word) for word in x.split()]))
    # TODO: Implement the preprocessing logic
    # df = df.str.lower()  # Convert to lowercase
    # You can extend this with additional preprocessing steps.

    # Example (simple preprocessing):
    df = df.apply(gensim.utils.simple_preprocess)

    return df

processed_df = preprocess_df(news_df['title'])







[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [14]:
processed_df

Unnamed: 0,title
0,"[score, killed, pakistan, clash]"
1,"[japan, resume, refuelling, mission]"
2,"[press, egypt, gaza, border]"
3,"[jumpstart, economy, give, health, care]"
4,"[council, europe, bash, euun, terror, blacklist]"
...,...
509231,"[heil, trump, donald, trump, altright, white, ..."
509232,"[people, speculating, could, madeleine, mccann]"
509233,"[professor, receives, arab, researcher, award]"
509234,"[nigel, farage, attack, response, trump, ambas..."


# Step 3: Building the Model

**Parameters:**

**vector_size** = (int) - Dimensionality of the feature vectors.

**alpha** = (float) - The initial learning rate

**window** = (int) - The maximum distance between the current and predicted word within a sentence.

**min_count** = (int) - Ignores all words with total frequency lower than this.

**epochs** = (int) - Number of iterations over the whole dataset


more info: https://www.kaggle.com/code/pierremegret/gensim-word2vec-tutorial#Training-the-model

In [26]:
model = gensim.models.Word2Vec(
        window=10,
        vector_size=100,
        alpha=0.06,
        min_count=7,
        epochs=20,
        workers=4,
        )

model.build_vocab(processed_df)

'''model = gensim.models.Word2Vec(
        window=10,
        vector_size=100,
        alpha=0.03,
        min_count=7,
        epochs=20,
        workers=4,
        )

model.build_vocab(processed_df)'''



'model = gensim.models.Word2Vec(\n        window=10,\n        vector_size=100,\n        alpha=0.03,\n        min_count=7,\n        epochs=20,\n        workers=4,\n        )\n\nmodel.build_vocab(processed_df)'

# Step 4: Training the Model

In [27]:
model.train(processed_df, total_examples=model.corpus_count, epochs=model.epochs)

(95580772, 99473020)

# Step 5: Testing the Model

In [29]:
df_test = pd.read_csv("wordsim353crowd.csv")
df_test['Human (Mean)']/=df_test['Human (Mean)'].max()
predictions = []
gt_list = []
s=0
for row in df_test.iterrows():
  try:
    model_output = model.wv.similarity(w1=row[1]['Word 1'], w2=row[1]['Word 2'])
    predictions.append(model_output)
    gt_list.append(row[1]['Human (Mean)'])
  except:
    s+=1
    pass
spearmanr_score = stats.spearmanr(predictions, gt_list)
print("___** FINAL RESULTS **___\n")
print(s)
print(f'spearmanr_score: {spearmanr_score.statistic}')

___** FINAL RESULTS **___

38
spearmanr_score: 0.590155002506057
