In [None]:
%matplotlib inline

import numpy as np
import random
import requests as rq
import sys
import io
import re
import pandas as pd
from gensim import models
from google.colab import files
from nltk.tokenize import sent_tokenize
from nltk.stem import WordNetLemmatizer
from gensim.models import Word2Vec
import nltk
# Fetch the NLTK data this notebook relies on: tokenizer data for sentence
# splitting, the stop-word corpus, and the WordNet corpus for lemmatization.
for resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(resource)

# Shared lemmatizer instance and English stop-word list used during cleaning.
lemmatizer = WordNetLemmatizer()
stop_words = nltk.corpus.stopwords.words('english')



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [None]:
# Ask for a file through Colab's upload helper; the result is later indexed by
# file name to obtain the raw bytes of the CSV.
uploaded = files.upload()


Saving fox_news.csv to fox_news.csv


In [None]:
# Loading the dataset: wrap the uploaded bytes of 'fox_news.csv' in an
# in-memory buffer so pandas can parse them as a CSV file.
df_fox = pd.read_csv(io.BytesIO(uploaded['fox_news.csv']))

In [None]:
# Preprocess data: drop rows with missing values, then drop repeated articles.
#
# drop_duplicates() returns a new frame rather than modifying in place, so its
# result has to be assigned (the original call discarded it). Rows are compared
# on the article text only, so that any row-id column read from the CSV cannot
# make repeated articles look distinct.
#
# The index is rebuilt last so labels run 0..n-1 with no gaps.
df_fox = (
    df_fox
    .dropna()
    .drop_duplicates(subset=['article'])
    .reset_index(drop=True)
)

# Show the cleaned frame (pandas truncates the display for large frames).
df_fox

Unnamed: 0,article
0,Consumer Reports has no financial relationship...
1,\n Parma Justice Center (pictured) si...
2,"HUNTSVILLE, Texas – The lure of drugs and sex..."
3,"\n November 19, 2015. Will Smith pos..."
4,\n A video has gone viral of a teenag...
...,...
17619,close Video House Rules Committee to meet on r...
17620,\n FBI agents searched a Georgia land...
17621,close Video Kim Jong Un’s most bizarre claims ...
17622,close Video Moon golf: How Astronaut Alan Shep...


In [None]:
# Patterns are compiled once because the function is called for every sentence.
# A URL is taken to be the scheme plus everything up to the next whitespace, so
# fragments and characters such as '#', '~' or '!' no longer leave pieces of
# the link behind as spurious words.
_URL_RE = re.compile(r'https?://\S+', re.IGNORECASE)
_NON_LETTER_RE = re.compile(r'[^a-zA-Z]')
_MULTI_SPACE_RE = re.compile(r' +')


def remove_link_punc(string):
    """Strip links and every non-letter character from a piece of text.

    Args:
        string: The text to clean.

    Returns:
        The lower-cased text with http/https links removed, every character
        outside a-z / A-Z replaced by a space, and runs of spaces collapsed
        to one. A single leading or trailing space may remain; callers that
        split on whitespace are unaffected.
    """
    # removing links
    without_links = _URL_RE.sub(' ', string)

    # removing everything except a-z english letters
    letters_only = _NON_LETTER_RE.sub(' ', without_links)

    # removing extra spaces
    return _MULTI_SPACE_RE.sub(' ', letters_only).lower()


def data_cleaning(content):
    """Turn raw documents into a list of cleaned, lemmatized sentences.

    Each document is split into sentences with NLTK. Every sentence is
    lower-cased, stripped of links and non-letter characters, filtered of
    stop words and of words shorter than three characters, and lemmatized.

    Args:
        content: An iterable of documents (for example a pandas Series or a
            list). Items are converted with ``str`` before tokenization;
            empty strings are skipped.

    Returns:
        A list of sentences, each a list of lemmatized word strings. Only
        sentences with at least five remaining words are kept.
    """
    # Set membership is constant-time; the stop-word list is scanned otherwise.
    stop_word_set = set(stop_words)

    sentences = []
    # Iterate over the values themselves rather than content[idx], which is a
    # label lookup on a Series and fails when the index is not exactly 0..n-1.
    for document in content:
        if document == "":
            continue

        # Sentence tokenization using NLTK library
        for each_sent in sent_tokenize(str(document)):
            # Removing link and punctuation
            each_sent = remove_link_punc(each_sent.lower())

            # Removing stopwords and applying lemmatization. The word list is
            # rebuilt for every sentence so nothing carries over between them.
            temp_sent = [
                lemmatizer.lemmatize(each_word)
                for each_word in each_sent.split()
                if len(each_word) >= 3 and each_word not in stop_word_set
            ]

            # Only taking word list if length is greater than or equal to 5
            if len(temp_sent) >= 5:
                sentences.append(temp_sent)
    return sentences

In [None]:
# Build the training corpus from the article column: one list of cleaned
# words per kept sentence.
fox_corpus = data_cleaning(df_fox.article)

In [None]:
# Number of sentences kept after cleaning.
len(fox_corpus)

438668

In [None]:
# Spot-check one cleaned sentence.
fox_corpus[235]

['dot',
 'posted',
 'photo',
 'smoke',
 'crack',
 'coming',
 'roadway',
 'behind',
 'barricade']

In [None]:
# Training a word2vec model on our data: 200-dimensional vectors, a context
# window of 4 words, and every word kept regardless of frequency (min_count=1).
# NOTE(review): `size` is the gensim 3.x keyword; gensim 4 renamed it to
# `vector_size` — confirm against the installed gensim version.

model = Word2Vec(sentences=fox_corpus, size=200, window=4, min_count=1, workers=4)

In [None]:
# Sanity check: the words most similar to "gun" in the trained embedding.
model.wv.similar_by_word("gun")

[('handgun', 0.644544243812561),
 ('firearm', 0.6086054444313049),
 ('pistol', 0.5912555456161499),
 ('rifle', 0.5761883854866028),
 ('shotgun', 0.5520564317703247),
 ('caliber', 0.5139201879501343),
 ('stun', 0.5013119578361511),
 ('bullet', 0.4948554039001465),
 ('knife', 0.48430222272872925),
 ('vehicle', 0.4762563109397888)]

Checking the distances between each test word and each adjective

In [None]:


# Descriptor words whose embedding distance to each test word is measured.
# NOTE(review): the final entries include 'knowledge', which is a noun —
# confirm it is meant to be part of this list.
adjectives = ['disorganized', 'devious', 'impressionable', 'circumspect', 'impassive', 
             'aimless', 'effeminate', 'unfathomable', 'fickle', 'unprincipled', 'inoffensive', 
             'reactive', 'providential', 'resentful', 'bizarre', 'impractical',
             'sarcastic', 'misguided', 'imitative', 'pedantic', 'venomous', 'erratic', 'insecure', 
             'resourceful', 'neurotic', 'forgiving', 'profligate', 'whimsical', 'assertive', 
             'incorruptible', 'individualistic', 'faithless', 'disconcerting', 'barbaric', 
             'hypnotic', 'vindictive', 'observant', 'dissolute', 'frightening', 'complacent', 
             'boisterous', 'pretentious', 'disobedient', 'tasteless', 'sedentary', 
             'sophisticated', 'regimental', 'mellow', 'deceitful', 'impulsive', 'playful', 
             'sociable', 'methodical', 'willful', 'idealistic', 'boyish', 'callous', 'pompous', 
             'unchanging', 'crafty', 'punctual', 'compassionate', 'intolerant', 'challenging', 
             'scornful', 'possessive', 'conceited', 'imprudent', 'dutiful', 'lovable', 
             'disloyal', 'dreamy', 'appreciative', 'forgetful', 'unrestrained', 'forceful', 
             'submissive', 'predatory', 'fanatical', 'illogical', 'tidy', 'aspiring', 'studious', 
             'adaptable', 'conciliatory', 'artful', 'thoughtless', 'deceptive', 'frugal', 
             'reflective', 'insulting', 'unreliable', 'stoic', 'hysterical', 'rustic', 
             'inhibited', 'outspoken', 'unhealthy', 'ascetic', 'skeptical', 'painstaking', 
             'contemplative', 'leisurely', 'sly', 'mannered', 'outrageous', 'lyrical', 
             'placid', 'cynical', 'irresponsible', 'vulnerable', 'arrogant', 'persuasive', 
             'perverse', 'steadfast', 'crisp', 'envious', 'naive', 'greedy', 'presumptuous', 
             'obnoxious', 'irritable', 'dishonest', 'discreet', 'sporting', 'hateful', 
             'ungrateful', 'frivolous', 'reactionary', 'skillful', 'cowardly', 'sordid', 
             'adventurous', 'dogmatic', 'intuitive', 'bland', 'indulgent', 'discontented', 
             'dominating', 'articulate', 'fanciful', 'discouraging', 'treacherous', 
             'repressed', 'moody', 'sensual', 'unfriendly', 'optimistic', 'clumsy', 
             'contemptible', 'focused', 'haughty', 'morbid', 'disorderly', 'considerate', 
             'humorous', 'preoccupied', 'airy', 'impersonal', 'cultured', 'trusting', 
             'respectful', 'scrupulous', 'scholarly', 'superstitious', 'tolerant', 
             'realistic', 'malicious', 'irrational', 'sane', 'colorless', 'masculine', 
             'witty', 'inert', 'prejudiced', 'fraudulent', 'blunt', 'childish', 'brittle', 
             'disciplined', 'responsive', 'courageous', 'bewildered', 'courteous', 
             'stubborn', 'aloof', 'sentimental', 'athletic', 'extravagant', 'brutal', 
             'manly', 'cooperative', 'unstable', 'youthful', 'timid', 'amiable', 'retiring', 
             'fiery', 'confidential', 'relaxed', 'imaginative', 'mystical', 'shrewd', 
             'conscientious', 'monstrous', 'grim', 'questioning', 'lazy', 'dynamic', 
             'gloomy', 'troublesome', 'abrupt', 'eloquent', 'dignified', 'hearty', 'gallant', 
             'benevolent', 'maternal', 'paternal', 'patriotic', 'aggressive', 'competitive', 
             'elegant', 'flexible', 'gracious', 'energetic', 'tough', 'contradictory', 
             'shy', 'careless', 'cautious', 'polished', 'sage', 'tense', 'caring', 
             'suspicious', 'sober', 'neat', 'transparent', 'disturbing', 'passionate', 
             'obedient', 'crazy', 'restrained', 'fearful', 'daring', 'prudent', 'demanding', 
             'impatient', 'cerebral', 'calculating', 'amusing', 'honorable', 'casual',
             'sharing', 'selfish', 'ruined', 'spontaneous', 'admirable', 'conventional', 
             'cheerful', 'solitary', 'upright', 'stiff', 'enthusiastic', 'petty', 'dirty', 
             'subjective', 'heroic', 'stupid', 'modest', 'impressive', 'orderly', 'ambitious', 
             'protective', 'silly', 'alert', 'destructive', 'exciting', 'crude', 'ridiculous', 
             'subtle', 'mature', 'creative', 'coarse', 'passive', 'oppressed', 'accessible', 
             'charming', 'clever', 'decent', 'miserable', 'superficial', 'shallow', 'stern', 
             'winning', 'balanced', 'emotional', 'rigid', 'invisible', 'desperate', 'cruel', 
             'romantic', 'agreeable', 'hurried', 'sympathetic', 'solemn', 'systematic', 
             'vague', 'peaceful', 'humble', 'dull', 'expedient', 'loyal', 'decisive', 
             'arbitrary', 'earnest', 'confident', 'conservative', 'foolish', 'moderate', 
             'helpful', 'delicate', 'gentle', 'dedicated', 'hostile', 'generous', 'reliable', 
             'dramatic', 'precise', 'calm', 'healthy', 'attractive', 'artificial', 
             'progressive', 'odd', 'confused', 'rational', 'brilliant', 'intense', 
             'genuine', 'mistaken', 'driving', 'stable', 'objective', 'sensitive', 
             'neutral', 'strict', 'angry', 'profound', 'smooth', 'ignorant', 'thorough', 
             'logical', 'intelligent', 'extraordinary', 'experimental', 'steady', 
             'formal', 'faithful', 'curious', 'reserved', 'honest', 'busy', 'educated', 
             'liberal', 'friendly', 'efficient', 'sweet', 'surprising', 'mechanical', 
             'clean', 'critical', 'criminal', 'soft', 'proud', 'quiet', 'weak', 'anxious', 
             'solid', 'complex', 'grand', 'warm', 'slow', 'false', 'extreme', 'narrow', 
             'dependent', 'wise', 'organized', 'pure', 'directed', 'dry', 'obvious', 'popular', 
             'capable', 'secure', 'active', 'independent', 'ordinary', 'fixed', 'practical', 
             'serious', 'fair', 'understanding', 'constant', 'cold', 'responsible', 'deep', 
             'religious', 'private', 'simple', 'physical', 'original', 'working', 'strong', 
             'modern', 'determined', 'open', 'political', 'difficult', 'knowledge', 'kind']



In [None]:
# Target words whose distance to every adjective is measured.
testing_words = ["gun", "metoo", "america", "tax", "abortion", "democrat", "republican", "nra", "trump", "biden", "china", "gay", "lesbian"]

In [None]:



# For each test word, measure its distance to every adjective in the embedding
# trained on the Fox corpus and print the five closest adjectives.
for word in testing_words:
    fox_gun_dists = []
    for adj in adjectives:
        try:
            dist = model.wv.distance(word, adj)
        except KeyError:
            # The word or the adjective is not in the model's vocabulary. Keep
            # the row without a distance so each table still has one row per
            # adjective; any other error is allowed to surface.
            dist = None
        fox_gun_dists.append([word, adj, dist])

    df = pd.DataFrame(fox_gun_dists, columns=['word', 'adj', 'dist'])
    print(df.shape)

    # Rows without a distance sort last, so head(5) holds the nearest adjectives.
    df_sorted = df.sort_values(by='dist', ascending=True)
    print(df_sorted.head(5))


(423, 3)
    word        adj      dist
342  gun    driving  0.654319
167  gun  malicious  0.724105
373  gun   criminal  0.725976
400  gun      fixed  0.726856
299  gun  invisible  0.727636
(423, 3)
      word          adj      dist
334  metoo  progressive  0.541311
155  metoo     humorous  0.551896
339  metoo      intense  0.557299
219  metoo    patriotic  0.563261
125  metoo      hateful  0.564394
(423, 3)
        word         adj      dist
219  america   patriotic  0.615573
181  america  courageous  0.631666
416  america      modern  0.632552
375  america       proud  0.637101
286  america   oppressed  0.638124
(423, 3)
    word        adj      dist
296  tax   balanced  0.518489
270  tax     modest  0.559856
379  tax      solid  0.590156
48   tax  deceitful  0.622806
273  tax  ambitious  0.643348
(423, 3)
         word           adj      dist
409  abortion     religious  0.391475
347  abortion        strict  0.452998
346  abortion       neutral  0.490430
334  abortion   progressive  

0.66072565

In [None]:
# Importing the pretrained Google News word2vec vectors. The name `gn_model`
# was used by the loop below without ever being assigned, so this cell could
# not run on a fresh kernel.
# NOTE(review): the first call downloads a large archive (over 1 GB) and caches
# it locally; swap in a local file load if the vectors are already on disk.
import gensim.downloader as api

gn_model = api.load('word2vec-google-news-300')

# Work with the vectors directly whether `gn_model` is a full model (vectors
# exposed under `.wv`) or already a bare set of keyed vectors.
gn_vectors = getattr(gn_model, 'wv', gn_model)

# For each test word, measure its distance to every adjective in the Google
# News embedding and print the five closest adjectives.
for word in testing_words:
    gn_dists = []
    for adj in adjectives:
        try:
            dist = gn_vectors.distance(word, adj)
        except KeyError:
            # The word or the adjective is not in the vocabulary. Keep the row
            # without a distance so each table still has one row per
            # adjective; any other error is allowed to surface.
            dist = None
        gn_dists.append([word, adj, dist])

    df = pd.DataFrame(gn_dists, columns=['word', 'adj', 'dist'])
    print(df.shape)

    # Rows without a distance sort last, so head(5) holds the nearest adjectives.
    df_sorted = df.sort_values(by='dist', ascending=True)
    print(df_sorted.head(5))


SyntaxError: ignored