In [6]:
%matplotlib inline

import numpy as np
import random
import requests as rq
import sys
import io
import re
import pandas as pd
from gensim import models
from nltk.tokenize import sent_tokenize
from nltk.stem import WordNetLemmatizer
from gensim.models import Word2Vec
import nltk
# Fetch the NLTK resources this notebook relies on (downloads are skipped
# by NLTK when the package is already up to date).
for nltk_resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(nltk_resource)

# Shared helpers used by the text-cleaning functions defined further down.
lemmatizer = WordNetLemmatizer()
stop_words = nltk.corpus.stopwords.words('english')

[nltk_data] Downloading package punkt to /Users/abby/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/abby/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/abby/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [7]:
# Loading the dataset
# One CSV per news outlet; each is expected to expose an `article` column,
# which is what the cleaning step reads later on.
# The directory is kept in a single place so it can be changed easily (the
# Reuters path previously carried a stray double slash).
DATA_DIR = "./datasets"

df_fox = pd.read_csv(f"{DATA_DIR}/fox_news.csv")
df_reuters = pd.read_csv(f"{DATA_DIR}/reuters.csv")
df_nyt = pd.read_csv(f"{DATA_DIR}/nyt.csv")
df_the_hill = pd.read_csv(f"{DATA_DIR}/the_hill.csv")
df_cnn = pd.read_csv(f"{DATA_DIR}/cnn.csv")

In [8]:
#preprocess data

# For every outlet: drop rows with missing values, drop repeated articles,
# then rebuild the index.
#
# `drop_duplicates()` returns a new frame unless `inplace=True` is given, so
# calling it bare (as this cell used to) removed nothing.
for news_frame in (df_fox, df_nyt, df_reuters, df_the_hill, df_cnn):
    news_frame.dropna(inplace=True)
    # Compare on the article text only: a saved row-number column (the
    # displayed frame shows `Unnamed: 0`) would otherwise make every row
    # look unique.
    news_frame.drop_duplicates(subset=["article"], inplace=True)
    # Reset last so the index is a gap-free 0..n-1 range again.
    news_frame.reset_index(inplace=True, drop=True)

# Display one cleaned frame as a sanity check (pandas truncates the output).
df_nyt

Unnamed: 0,article
0,You have 300 words left.
1,"“Sancho: An Act of Remembrance,” which was ori..."
2,"Deconstructing the party of the year, includin..."
3,11 Photos View Slide Show › Whether you’re her...
4,Sometimes plain old pasta with red sauce is ju...
...,...
54431,"FLORHAM PARK, N.J. — A day after the Jets fire..."
54432,"Warren Wells, a former wide receiver who becam..."
54433,The Christian right doesn’t like the president...
54434,"AUG. 29, 2018\n \nAdd an event to your calenda..."


In [9]:
# Compiled once when the cell runs; the patterns never change between calls.
# The digit ranges use ASCII hyphens -- a typographic en-dash inside `[0-9]`
# turns the class into three literal characters instead of a range.
_URL_PATTERN = re.compile(
    r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
)
_NON_LETTER_PATTERN = re.compile('[^a-zA-Z]')
_SPACE_RUN_PATTERN = re.compile(' +')


def remove_link_punc(string):
    """Strip links and every non-letter character from a piece of text.

    Steps, in order:
      1. each http/https link is replaced by a single space;
      2. every character outside ``a-z`` / ``A-Z`` is replaced by a space;
      3. runs of spaces are collapsed to one and the text is lower-cased.

    Args:
        string: The text to clean.

    Returns:
        The cleaned, lower-cased text. A single leading or trailing space
        may remain where the input started or ended with a non-letter.
    """
    # removing links
    without_links = _URL_PATTERN.sub(' ', string)

    # removing everything except a-z english letters
    letters_only = _NON_LETTER_PATTERN.sub(' ', without_links)

    # removing extra spaces
    return _SPACE_RUN_PATTERN.sub(' ', letters_only).lower()


def data_cleaning(content, min_word_len=3, min_sent_len=5):
    """Convert raw article texts into cleaned token lists, one per sentence.

    Each document is split into sentences with ``sent_tokenize``; every
    sentence is lower-cased, passed through ``remove_link_punc``, split on
    whitespace, filtered against ``stop_words`` and a minimum word length,
    and lemmatised with the module-level ``lemmatizer``.

    Args:
        content: Iterable of documents (e.g. a pandas Series or a list).
            Non-string items are converted with ``str()``.
        min_word_len: Words shorter than this are discarded (default 3).
        min_sent_len: Sentences with fewer kept words than this are
            discarded (default 5).

    Returns:
        A list of sentences, each a list of lemmatised word strings.
    """
    # Build the lookup set once per call so each membership test below is a
    # hash lookup rather than a scan.
    stop_set = set(stop_words)

    sentences = []
    # Iterate over the values directly: label-based `content[idx]` lookups
    # fail on a Series whose index is not exactly 0..n-1.
    for document in content:
        if document == "":
            continue

        # Sentence tokenization using NLTK library
        for each_sent in sent_tokenize(str(document)):
            # Skip empty sentences entirely, so nothing is processed with a
            # stale or undefined token list.
            if each_sent == "":
                continue

            # Removing link and punctuation
            cleaned_sent = remove_link_punc(each_sent.lower())

            # Removing stopwords and applying lemmatization
            tokens = [
                lemmatizer.lemmatize(each_word)
                for each_word in cleaned_sent.split()
                if each_word not in stop_set and len(each_word) >= min_word_len
            ]

            # Only keep sentences with enough remaining words
            if len(tokens) >= min_sent_len:
                sentences.append(tokens)
    return sentences

In [10]:
# Build one tokenised corpus per outlet from its `article` column.
# Each corpus is a list of sentences, each sentence a list of words.
fox_corpus = data_cleaning(df_fox["article"])
reuters_corpus = data_cleaning(df_reuters["article"])
nyt_corpus = data_cleaning(df_nyt["article"])
cnn_corpus = data_cleaning(df_cnn["article"])
the_hill_corpus = data_cleaning(df_the_hill["article"])

In [12]:
# Train a skip-gram (sg=1) Word2Vec model on the Fox News sentences.
# Every outlet below is trained with the same hyper-parameters.
model = Word2Vec(
    sentences=fox_corpus,
    sg=1,
    window=4,
    min_count=5,
    workers=4,
)

In [19]:
# Skip-gram (sg=1) Word2Vec model for the CNN sentences.
model_cnn = Word2Vec(
    sentences=cnn_corpus,
    sg=1,
    window=4,
    min_count=5,
    workers=4,
)

In [21]:
# Skip-gram (sg=1) Word2Vec model for The Hill sentences.
model_the_hill = Word2Vec(
    sentences=the_hill_corpus,
    sg=1,
    window=4,
    min_count=5,
    workers=4,
)

In [22]:
# Skip-gram (sg=1) Word2Vec model for the Reuters sentences.
model_reuters = Word2Vec(
    sentences=reuters_corpus,
    sg=1,
    window=4,
    min_count=5,
    workers=4,
)

In [23]:
# Skip-gram (sg=1) Word2Vec model for the New York Times sentences.
model_nyt = Word2Vec(
    sentences=nyt_corpus,
    sg=1,
    window=4,
    min_count=5,
    workers=4,
)

In [24]:
# Persist every trained model to the ./models/ directory, one file per outlet.
for trained_model, save_path in (
    (model, './models/fox.model'),
    (model_cnn, './models/cnn.model'),
    (model_the_hill, './models/the_hill.model'),
    (model_reuters, './models/reuters.model'),
    (model_nyt, './models/nyt.model'),
):
    trained_model.save(save_path)