In [1]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances_argmin_min
from sklearn.metrics import accuracy_score, classification_report, f1_score
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet as wn, sentiwordnet as swn
from token_embeddings import generate_embeddings
from tokenize_clean_text import clean_text

# Shared WordNetLemmatizer instance; swn_classifier() uses it to lemmatize each
# token before the WordNet synset lookup.
lemmatizer = WordNetLemmatizer()


Generate Embeddings

In [2]:
# Generate embeddings for our corpus. Per the original note, this writes
# project_embeddings.csv into the working directory; the next cell reads it back.
generate_embeddings("All_output.csv")

Found 400000 word vectors in glove dict
Found 5338 unique tokens in corpus
Number of embeddings from corpus generated: 5113


In [3]:
# Load the generated embeddings, using the first CSV column as the row index.
# cluster_embeddings() reads that index as the key (presumably the word) of each row.
embeddings_df = pd.read_csv("project_embeddings.csv", index_col=0)

Clustering Function

In [4]:
#clustering

def cluster_embeddings(df, num_of_clusters):
    """Cluster the rows of ``df`` with k-means and name each cluster by a row label.

    For every cluster, the row of ``df`` that lies closest (Euclidean distance)
    to the cluster's geometric centroid is looked up, and its index label is
    used as the "centroid word" of that cluster.

    Parameters
    ----------
    df : pandas.DataFrame
        Embedding matrix; the index labels are used as dictionary keys and
        values (presumably one word per row -- confirm against the CSV).
    num_of_clusters : int
        Number of k-means clusters to fit.

    Returns
    -------
    dict
        Maps each index label of ``df`` to the index label chosen as the
        centroid word of the cluster that row was assigned to.
    """
    # Fixed random_state keeps the clustering reproducible between runs.
    model = KMeans(n_clusters = num_of_clusters, random_state=42).fit(df)

    # For each geometric centroid, the position of the nearest row in df;
    # the distances themselves are not needed.
    nearest_rows, _ = pairwise_distances_argmin_min(
        model.cluster_centers_, df, metric='euclidean'
    )
    centroid_words = df.iloc[nearest_rows].index

    # Row i of df carries label model.labels_[i]; swap that cluster number for
    # the cluster's centroid word.
    return {
        word: centroid_words[cluster_id]
        for word, cluster_id in zip(df.index, model.labels_)
    }
    

Replace words in reviews with their cluster centroids and then calculate score

In [5]:
def replace_with_centroids(review, word_centroid_dict):
    """Return a copy of ``review`` with known words swapped for their centroid word.

    Parameters
    ----------
    review : iterable
        Tokens of one review.
    word_centroid_dict : dict
        Maps a word to the centroid word of its cluster.

    Returns
    -------
    list
        One entry per input token, in order: the mapped value when the token is
        a key of ``word_centroid_dict``, otherwise the token itself.
    """
    swapped = []
    for token in review:
        # Tokens without an entry in the mapping pass through unchanged.
        swapped.append(word_centroid_dict.get(token, token))
    return swapped
    
    

def swn_classifier(review):
    """Score a tokenized review with SentiWordNet.

    Each token is lemmatized with the module-level ``lemmatizer`` and looked up
    in WordNet. For every token that has at least one synset, the first
    synset's SentiWordNet positive score minus its negative score is added to
    the running total.

    Parameters
    ----------
    review : iterable of str
        Tokens of one review.

    Returns
    -------
    float or int
        Sum of (positive - negative) scores over the tokens that could be
        scored; ``0`` when no token could be scored.
    """
    sentiment = 0.0
    tokens_count = 0

    for word in review:
        lemma = lemmatizer.lemmatize(word)
        if not lemma:
            continue

        synsets = wn.synsets(lemma)
        if not synsets:
            continue

        # Only the first synset is used (the original note treats it as the
        # most common sense of the word).
        swn_synset = swn.senti_synset(synsets[0].name())

        # Nothing is printed per token: a bare print() here emitted one blank
        # line for every scored token and flooded the notebook output.

        # Sentiment contribution is the positive score minus the negative score.
        sentiment += swn_synset.pos_score() - swn_synset.neg_score()
        tokens_count += 1

    # Default: neither positive nor negative when no token was scored.
    if not tokens_count:
        return 0

    return sentiment

Call the clustering function to generate a word-to-centroid dictionary. `num` (the number of clusters) is a hyperparameter to be tuned for best accuracy.

In [6]:
# Build the word -> centroid-word dictionary from the loaded embeddings.
# NOTE(review): this comment used to say "roughly 11 words per cluster", but with the
# 5113 embeddings reported by generate_embeddings above, 2000 clusters gives
# roughly 2-3 words per cluster -- confirm the intended cluster count.
num = 2000   # number of k-means clusters (hyperparameter)
word_centroid_dict = cluster_embeddings(embeddings_df, num)
    

Import the data file, process it (sentiment classification), and export it in the required format.

In [7]:
# Load the reviews file; only its "text" column is scored below.
reviews = pd.read_csv("All_output.csv")

# Per-review results, kept as parallel lists in row order.
clean_tokens, replaced_tokens, y_predicted = [], [], []

# Clean each text, swap words for their cluster centroids, then score it.
for raw_text in reviews["text"]:
    tokens = clean_text(raw_text)
    centroid_tokens = replace_with_centroids(tokens, word_centroid_dict)

    clean_tokens.append(tokens)
    replaced_tokens.append(centroid_tokens)
    y_predicted.append(swn_classifier(centroid_tokens))


# Classify reviews according to the sentiment score assigned:
#  1 : positive, 0 : neutral, -1 : negative
y_classified = []
for score in y_predicted:
    if score == 0:
        y_classified.append(0)
    elif score < 0:
        y_classified.append(-1)
    elif score > 0:
        y_classified.append(1)


# Append the results to the DataFrame as new columns (in this order).
new_columns = {
    "tokens": clean_tokens,
    "replaced_centroids": replaced_tokens,
    "sentiment_score": y_predicted,
    "predicted_sentiment": y_classified,
}
for column_name, column_values in new_columns.items():
    reviews[column_name] = column_values

# Export the scored DataFrame.
reviews.to_csv("classified_full_review_embeddings.csv", header=True)
















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































In [8]:
# Preview the first rows of the scored DataFrame, including the columns added in the processing cell.
reviews.head()

Unnamed: 0,created_at,text,after_clean_text,score,tokens,replaced_centroids,sentiment_score,predicted_sentiment
0,Mon Apr 20 15:02:06 +0000 2020,RT @SmartEnergyW: As we count down to celebrat...,count celebrate anniversary time remind everyo...,0.041667,"[count, celebrate, anniversary, time, remind, ...","[count, celebration, celebration, time, remind...",0.25,1
1,Mon Apr 20 15:02:02 +0000 2020,☕️𝘊𝘢𝘧é 𝘊𝘰𝘯 𝘕𝘦𝘸𝘴: https://t.co/AkXvfWDRhn via @...,hot beverage 𝘊𝘰𝘯 𝘕𝘦𝘸𝘴 via absolute realest gam...,0.0,"[hot, beverage, 𝘊𝘰𝘯, 𝘕𝘦𝘸𝘴, via, absolute, real...","[cool, drinks, 𝘊𝘰𝘯, 𝘕𝘦𝘸𝘴, via, absolute, woodl...",-0.375,-1
2,Mon Apr 20 15:01:45 +0000 2020,Are our governments and international institut...,governments international institutions incompe...,-0.229167,"[governments, international, institutions, inc...","[governments, international, institutions, hyp...",-1.0,-1
3,Mon Apr 20 15:01:35 +0000 2020,If it's about bringing smiles to the world aga...,bringing smiles world let us rekindle whole wo...,0.03125,"[bringing, smiles, world, let, us, rekindle, w...","[turning, smiling, europe, sure, us, forestall...",0.25,1
4,Mon Apr 20 15:01:23 +0000 2020,RT @FreddyBeltranP: Good Morning ☀️\r\n T...,good morning sun twitter planet globe meridian...,0.0625,"[good, morning, sun, twitter, planet, globe, m...","[sure, afternoon, sun, twitter, earth, globe, ...",-0.25,-1


In [9]:
# Write the same scored DataFrame to a second CSV file.
# NOTE(review): the processing cell already exports this frame to
# classified_full_review_embeddings.csv -- confirm both files are needed.
reviews.to_csv("op_embeddings_score.csv", header=True)

In [16]:
# NOTE(review): `op_word_embed` is not defined in any cell shown in this notebook, and
# this cell carries an out-of-order execution count (In [16]). On a fresh kernel
# it will presumably raise NameError -- load/define the frame first or drop the cell.
op_word_embed.head()

Unnamed: 0,text,tokens,replaced_centroids,sentiment_score,predicted_sentiment
0,RT @FreddyBeltranP: Good Night 🌙\r\n T...,"[good, night, crescent, moon, twitter, planet,...","[better, night, crescent, mars, youtube, earth...",0.5,1
1,"@priyamenon96 Great interview, though I do dis...","[great, interview, though, disagree, use, palm...","[greatest, asked, however, consider, using, be...",0.5,1
2,RT @QUBEcc: Do you spend half of your day trav...,"[spend, half, day, travelling, work, worked, m...","[paying, second, days, traveling, work, work, ...",-0.75,-1
3,#RECYCLING TIP OF THE DAY | Be more eco-friend...,"[replace, liquid, hand, wash, soap, packed, ca...","[replacing, liquid, hand, washing, soap, packe...",0.0,0
4,RT @ResistsPotus: #SaveThePlanet\r\n\r\nIn wit...,"[green, new, deal, old, orange, fart]","[brown, current, proposal, man, orange, fucking]",-0.375,-1


In [None]:
# Accuracy of the predicted labels against the reference labels.
# NOTE(review): the reviews preview shown earlier lists no "true_sentiment" column and
# this cell has no execution count -- confirm where the reference labels come from.
accuracy_score(reviews["true_sentiment"], y_classified)

In [None]:
# Per-class F1 scores (average=None returns one score per label).
# NOTE(review): the reviews preview shown earlier lists no "true_sentiment" column and
# this cell has no execution count -- confirm where the reference labels come from.
f1_score(reviews["true_sentiment"], y_classified, average=None)

In [None]:
# Some examples: print rows 0, 20, 40, 60 and 80 for manual inspection.
# NOTE(review): the reviews preview shown earlier lists no "rating" or "full_review"
# columns and this cell has no execution count -- these lookups presumably raise
# KeyError; confirm the intended column names.
for n in range(0,100,20):
    print("user_rating:",reviews["rating"][n] )
    print("review:",reviews["full_review"][n])
    print("tokens:",reviews["tokens"][n])
    print("replaced centroids:",reviews["replaced_centroids"][n])
    print("sentiment_score:",reviews["sentiment_score"][n],'\n')