In [238]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score
from IPython.display import display

In [239]:
# Load the training CSV; later cells read its `title` and `rate` columns.
final_file = pd.read_csv('preprocessing_and_embeddings/data_kaggle_train.csv')

In [240]:
# Load the word / sentiment-coefficient table and turn it into a lookup dict
# keyed by word.
sentiment_map = pd.read_csv('KMeans_clustering//sentiment_dictionary_eng.csv')
sentiment_dict = {
    word: coeff
    for word, coeff in zip(sentiment_map['words'].values, sentiment_map['sentiment_coeff'].values)
}

Computing the tf-idf score of every word in each sentence, then replacing each word with its tf-idf weight:

In [241]:
# Work on a copy so the weighting steps read from a frame separate from `final_file`.
file_weighting = final_file.copy()

In [242]:
# Fit TF-IDF on the titles. Tokens are the whitespace-separated words (the
# vectorizer lowercases its input by default), and norm=None keeps the raw
# tf*idf weights instead of normalising each row.
tfidf = TfidfVectorizer(tokenizer=lambda y: y.split(), norm=None)
transformed = tfidf.fit_transform(file_weighting.title)

# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out()
# when it exists and fall back only on older releases that lack it.
if hasattr(tfidf, 'get_feature_names_out'):
    features = pd.Series(tfidf.get_feature_names_out())
else:
    features = pd.Series(tfidf.get_feature_names())

Replacing words in sentences with their tfidf scores

In [243]:
def create_tfidf_dictionary(x, transformed_file, features):
    '''
    Build a {word: tfidf score} dictionary for one input sentence.

    inspired by a function from this article:
    https://medium.com/analytics-vidhya/automated-keyword-extraction-from-articles-using-nlp-bfd864f41b34

    x - row of the dataframe; only x.name is used, as the row number of the
        sentence inside transformed_file (so the dataframe index has to match
        the row order of the matrix),
    transformed_file - all sentences transformed with TfidfVectorizer (sparse matrix),
    features - pandas Series with the names of all words in the corpus used in
        TfidfVectorizer, positioned by column number

    returns - dict mapping each word stored in that matrix row to its score
    '''
    vector_coo = transformed_file[x.name].tocoo()
    # Look the words up without writing them back into vector_coo.col: that
    # attribute is meant to hold integer column indices, and overwriting it
    # with strings is not supported by newer SciPy versions.
    words = features.iloc[vector_coo.col].values
    return dict(zip(words, vector_coo.data))

def replace_tfidf_words(x, transformed_file, features):
    '''
    Turn one sentence into the list of tfidf scores of its words, in word order.

    x - row of dataframe, containing sentences, and their indexes,
    transformed_file - all sentences transformed with TfidfVectorizer
    features - names of all words in corpus used in TfidfVectorizer

    A KeyError is raised if a lower-cased word of the sentence has no entry in
    the tfidf dictionary built for that row.
    '''
    scores_by_word = create_tfidf_dictionary(x, transformed_file, features)
    tokens = x.title.lower().split()
    return [scores_by_word[token] for token in tokens]

In [244]:
%%time
replaced_tfidf_scores = file_weighting.apply(lambda x: replace_tfidf_words(x, transformed, features), axis=1)  # row-wise apply over all titles; the recorded wall time for this cell is 2.74 s

Wall time: 2.74 s


Replacing words in sentences with their sentiment score

In [245]:
def replace_sentiment_words(word, sentiment_dict):
    '''
    Look up the sentiment score of a single word.

    word - the token to score,
    sentiment_dict - mapping from word to its sentiment score

    Words that are missing from sentiment_dict get a score of 0.
    '''
    return sentiment_dict.get(word, 0)

In [246]:
# For every title, map each whitespace-separated word to its sentiment score.
replaced_closeness_scores = file_weighting.title.apply(
    lambda sentence: [replace_sentiment_words(token, sentiment_dict) for token in sentence.split()]
)

Merging both previous steps and getting the predictions:

In [247]:
# One row per sentence: per-word sentiment scores, per-word tfidf weights,
# the sentence itself and its label.
replacement_df = pd.DataFrame({
    'sentiment_coeff': replaced_closeness_scores,
    'tfidf_scores': replaced_tfidf_scores,
    'sentence': file_weighting.title,
    'sentiment': file_weighting.rate,
})
# Sentence score = dot product of the word sentiment scores and the word tfidf weights.
replacement_df['sentiment_rate'] = replacement_df.apply(
    lambda row: np.array(row['sentiment_coeff']) @ np.array(row['tfidf_scores']),
    axis=1,
)
# A negative score is encoded as prediction 1.0, everything else as 0.0.
replacement_df['prediction'] = (replacement_df['sentiment_rate'] < 0).astype('float')
# Recode the label: 1 where it equals 1, otherwise 0.
replacement_df['sentiment'] = [1 if label == 1 else 0 for label in replacement_df['sentiment']]

In [248]:
# Inspect the assembled frame (scores, sentence, label, sentence score, prediction).
replacement_df

Unnamed: 0,sentiment_coeff,tfidf_scores,sentence,sentiment,sentiment_rate,prediction
0,"[-7.64017338550016, 0.2555467101699501, 0.5549...","[8.839131648274332, 6.8932214992190195, 6.1310...",deeds reason earthquake may allah forgive us,0,-68.473572,1.0
1,"[0.5135460528229118, 0.2315743788511709, 0.220...","[5.79460921055091, 4.461280385010932, 5.912392...",forest fire near la range ask canada,0,-21.648932,1.0
2,"[-1.6739370890911744, 0.10924603809551812, -1....","[7.858302395262607, 7.740519359606223, 16.2919...",residents asked shelter place notified officer...,0,-76.127170,1.0
3,"[-6.320735194659727, 0.1216853356327639, -1.07...","[8.328306024508343, 4.675053748037557, 8.83913...",13000 people receive wildfire evacuation order...,0,-61.576397,1.0
4,"[0.10188725847049758, 0.2973158272932801, 0.63...","[5.093556850483852, 7.298686607327184, 6.15355...",got sent photo ruby alaska smoke wildfire pour...,0,-57.292166,1.0
...,...,...,...,...,...,...
7608,"[0.14268399429993098, 0.8193858569753033, -0.9...","[5.373395745474607, 6.8932214992190195, 7.3727...",two giant cranes holding bridge collapse nearb...,0,15.135269,0.0
7609,"[-4.851248496140579, -8.10573345735835, 0.4925...","[8.839131648274332, 8.839131648274332, 6.71886...",ariaahrary thetawniest control wild fires cali...,0,-122.614244,1.0
7610,"[-2.2665357004994946, -3.3193030040177414, -4....","[8.145984467714388, 8.551449575822552, 8.83913...",m4 0104 utc5km volcano hawaii,0,-75.263248,1.0
7611,"[0.2255256890276889, 0.8306143036133936, 0.731...","[5.07020948648686, 7.165155214702661, 16.65661...",police investigating bike collided car little ...,0,-5.006652,1.0


In [256]:
# Attach the predictions to the loaded frame; pandas aligns the assigned Series on the index.
final_file["sentiment_prediction"] = replacement_df["prediction"]

In [258]:
# Drop the label column before exporting. errors="ignore" keeps this cell
# re-runnable: without it a second run raises KeyError because `rate` has
# already been removed from `final_file`.
final_file = final_file.drop(columns="rate", errors="ignore")
final_file.to_csv('data_sentiment_prediction.csv', index=False)

In [255]:
# NOTE(review): the saved output of this cell still lists `rate`, although the cell above
# drops that column - the execution counts show this cell was run before the drop.
final_file

Unnamed: 0,title,rate,sentiment_prediction
0,deeds reason earthquake may allah forgive us,0,1.0
1,forest fire near la range ask canada,0,1.0
2,residents asked shelter place notified officer...,0,1.0
3,13000 people receive wildfire evacuation order...,0,1.0
4,got sent photo ruby alaska smoke wildfire pour...,0,1.0
...,...,...,...
7608,two giant cranes holding bridge collapse nearb...,0,0.0
7609,ariaahrary thetawniest control wild fires cali...,0,1.0
7610,m4 0104 utc5km volcano hawaii,0,1.0
7611,police investigating bike collided car little ...,0,1.0


In [254]:
# NOTE(review): duplicates the export already done in the cell that drops `rate`;
# consider keeping only one of the two writes.
final_file.to_csv('data_sentiment_prediction.csv', index=False)

In [249]:
# Sum of the predictions (i.e. how many are 1.0) and the total number of sentences.
prediction_values = replacement_df['prediction'].tolist()
sum(prediction_values), len(prediction_values)

(6321.0, 7613)

Reporting model's metrics

In [250]:
# Compare the predictions against the recoded labels.
predicted_classes = replacement_df.prediction
y_test = replacement_df.sentiment

# Rows follow the labels (y_test), columns follow the predictions.
conf_matrix = pd.DataFrame(confusion_matrix(y_test, predicted_classes))
print('Confusion Matrix')
display(conf_matrix)

# Same order as the row labels used for the scores table below.
test_scores = tuple(
    metric(y_test, predicted_classes)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print('\n \n Scores')
scores = pd.DataFrame(
    {'scores': list(test_scores)},
    index=['accuracy', 'precision', 'recall', 'f1'],
)
display(scores)

Confusion Matrix


Unnamed: 0,0,1
0,689,2582
1,603,3739



 
 Scores


Unnamed: 0,scores
accuracy,0.581637
precision,0.59152
recall,0.861124
f1,0.701304
