In [195]:
import re 
import nltk
import torch
import spacy
import gensim
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from rank_bm25 import BM25Okapi, BM25L
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn import svm

In [2]:
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))


[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/it012311/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/it012311/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/it012311/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /Users/it012311/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
text_df = pd.read_csv('IMDB Dataset.csv')
text_df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
def preprossing(text):
    clean = re.compile('<.*?>')
    return re.sub(clean,'',text)

text_df['review'] = text_df['review'].apply(preprossing)
text_df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. The filming tec...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [5]:
def convert_lower(text):
    return text.lower()
text_df['review'] = text_df['review'].apply(convert_lower)
text_df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. the filming tec...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive


In [6]:
def tokenize(text):
    # Tokenize the cleaned text
    return nltk.word_tokenize(text)
text_df['tokens'] = text_df['review'].apply(tokenize)
text_df.head()

Unnamed: 0,review,sentiment,tokens
0,one of the other reviewers has mentioned that ...,positive,"[one, of, the, other, reviewers, has, mentione..."
1,a wonderful little production. the filming tec...,positive,"[a, wonderful, little, production, ., the, fil..."
2,i thought this was a wonderful way to spend ti...,positive,"[i, thought, this, was, a, wonderful, way, to,..."
3,basically there's a family where a little boy ...,negative,"[basically, there, 's, a, family, where, a, li..."
4,"petter mattei's ""love in the time of money"" is...",positive,"[petter, mattei, 's, ``, love, in, the, time, ..."


In [7]:
def lemmatize(tokens):
    return [lemmatizer.lemmatize(token) for token in tokens]

text_df['lemmatized'] = text_df['tokens'].apply(lemmatize)
text_df.head()

Unnamed: 0,review,sentiment,tokens,lemmatized
0,one of the other reviewers has mentioned that ...,positive,"[one, of, the, other, reviewers, has, mentione...","[one, of, the, other, reviewer, ha, mentioned,..."
1,a wonderful little production. the filming tec...,positive,"[a, wonderful, little, production, ., the, fil...","[a, wonderful, little, production, ., the, fil..."
2,i thought this was a wonderful way to spend ti...,positive,"[i, thought, this, was, a, wonderful, way, to,...","[i, thought, this, wa, a, wonderful, way, to, ..."
3,basically there's a family where a little boy ...,negative,"[basically, there, 's, a, family, where, a, li...","[basically, there, 's, a, family, where, a, li..."
4,"petter mattei's ""love in the time of money"" is...",positive,"[petter, mattei, 's, ``, love, in, the, time, ...","[petter, mattei, 's, ``, love, in, the, time, ..."


In [8]:
def lemmatize(tokens):
    filtered_tokens = [token for token in tokens if token.lower() not in stop_words]
    return [lemmatizer.lemmatize(token) for token in filtered_tokens]
text_df['lemmatized'] = text_df['tokens'].apply(lemmatize)


In [9]:
text_df.head()

Unnamed: 0,review,sentiment,tokens,lemmatized
0,one of the other reviewers has mentioned that ...,positive,"[one, of, the, other, reviewers, has, mentione...","[one, reviewer, mentioned, watching, 1, oz, ep..."
1,a wonderful little production. the filming tec...,positive,"[a, wonderful, little, production, ., the, fil...","[wonderful, little, production, ., filming, te..."
2,i thought this was a wonderful way to spend ti...,positive,"[i, thought, this, was, a, wonderful, way, to,...","[thought, wonderful, way, spend, time, hot, su..."
3,basically there's a family where a little boy ...,negative,"[basically, there, 's, a, family, where, a, li...","[basically, 's, family, little, boy, (, jake, ..."
4,"petter mattei's ""love in the time of money"" is...",positive,"[petter, mattei, 's, ``, love, in, the, time, ...","[petter, mattei, 's, ``, love, time, money, ''..."


In [10]:
text_df.drop(['review','tokens'],axis=1,inplace=True)
text_df.head()

Unnamed: 0,sentiment,lemmatized
0,positive,"[one, reviewer, mentioned, watching, 1, oz, ep..."
1,positive,"[wonderful, little, production, ., filming, te..."
2,positive,"[thought, wonderful, way, spend, time, hot, su..."
3,negative,"[basically, 's, family, little, boy, (, jake, ..."
4,positive,"[petter, mattei, 's, ``, love, time, money, ''..."


Bag Of Words

In [12]:
text_df['lemmatized_str'] = text_df['lemmatized'].apply(lambda x: ' '.join(x))
text_df.head()

Unnamed: 0,sentiment,lemmatized,lemmatized_str
0,positive,"[one, reviewer, mentioned, watching, 1, oz, ep...",one reviewer mentioned watching 1 oz episode '...
1,positive,"[wonderful, little, production, ., filming, te...",wonderful little production . filming techniqu...
2,positive,"[thought, wonderful, way, spend, time, hot, su...",thought wonderful way spend time hot summer we...
3,negative,"[basically, 's, family, little, boy, (, jake, ...",basically 's family little boy ( jake ) think ...
4,positive,"[petter, mattei, 's, ``, love, time, money, ''...",petter mattei 's `` love time money '' visuall...


In [13]:
vectorizer = CountVectorizer(max_features=500)
X = vectorizer.fit_transform(text_df['lemmatized_str'])


In [14]:
X_df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())


In [15]:
X_df.head()

Unnamed: 0,10,able,absolutely,act,acting,action,actor,actress,actually,add,...,worth,would,writer,writing,written,wrong,year,yes,yet,young
0,0,0,0,0,0,0,0,0,0,0,...,0,2,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,1,0,0,0,...,1,0,0,0,1,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Binary feature

In [17]:
X_df_binary = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
X_df_binary.head()


Unnamed: 0,10,able,absolutely,act,acting,action,actor,actress,actually,add,...,worth,would,writer,writing,written,wrong,year,yes,yet,young
0,0,0,0,0,0,0,0,0,0,0,...,0,2,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,1,0,0,0,...,1,0,0,0,1,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Count based feature

In [19]:
X_df_count = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
X_df_count.head()


Unnamed: 0,10,able,absolutely,act,acting,action,actor,actress,actually,add,...,worth,would,writer,writing,written,wrong,year,yes,yet,young
0,0,0,0,0,0,0,0,0,0,0,...,0,2,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,1,0,0,0,...,1,0,0,0,1,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Frequency based feature

In [21]:
X_df_frequency = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
normalized_X_df = X_df_frequency.div(X_df_frequency.sum(axis=1), axis=0)
normalized_X_df

Unnamed: 0,10,able,absolutely,act,acting,action,actor,actress,actually,add,...,worth,would,writer,writing,written,wrong,year,yes,yet,young
0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,...,0.000000,0.027397,0.0,0.0,0.000000,0.000000,0.000,0.0,0.0,0.000
1,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.023256,0.0,0.0,0.0,...,0.023256,0.000000,0.0,0.0,0.023256,0.000000,0.000,0.0,0.0,0.000
2,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.025,0.0,0.0,0.025
3,0.028571,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000,0.0,0.0,0.000
4,0.000000,0.0,0.0,0.000000,0.013699,0.013699,0.000000,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000,0.0,0.0,0.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,0.018868,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000,0.0,0.0,0.000
49996,0.000000,0.0,0.0,0.000000,0.045455,0.000000,0.000000,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000,0.0,0.0,0.000
49997,0.000000,0.0,0.0,0.019231,0.019231,0.000000,0.000000,0.0,0.0,0.0,...,0.000000,0.019231,0.0,0.0,0.000000,0.019231,0.000,0.0,0.0,0.000
49998,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,...,0.000000,0.024390,0.0,0.0,0.000000,0.000000,0.000,0.0,0.0,0.000


TF-IDF

In [23]:
tfidf_vectorizer = TfidfVectorizer(max_features=500)  
X_tfidf = tfidf_vectorizer.fit_transform(text_df['lemmatized_str'])


In [24]:
X_tfidf_df = pd.DataFrame(X_tfidf.toarray(), columns=tfidf_vectorizer.get_feature_names_out())
print(X_tfidf_df.head())


         10  able  absolutely  act    acting    action     actor  actress  \
0  0.000000   0.0         0.0  0.0  0.000000  0.000000  0.000000      0.0   
1  0.000000   0.0         0.0  0.0  0.000000  0.000000  0.107318      0.0   
2  0.000000   0.0         0.0  0.0  0.000000  0.000000  0.000000      0.0   
3  0.134987   0.0         0.0  0.0  0.000000  0.000000  0.000000      0.0   
4  0.000000   0.0         0.0  0.0  0.074293  0.096532  0.000000      0.0   

   actually  add  ...     worth     would  writer  writing   written  wrong  \
0       0.0  0.0  ...  0.000000  0.109105     0.0      0.0  0.000000    0.0   
1       0.0  0.0  ...  0.142212  0.000000     0.0      0.0  0.159024    0.0   
2       0.0  0.0  ...  0.000000  0.000000     0.0      0.0  0.000000    0.0   
3       0.0  0.0  ...  0.000000  0.000000     0.0      0.0  0.000000    0.0   
4       0.0  0.0  ...  0.000000  0.000000     0.0      0.0  0.000000    0.0   

     year  yes  yet     young  
0  0.0000  0.0  0.0  0.000000 

In [25]:
X_tfidf_df

Unnamed: 0,10,able,absolutely,act,acting,action,actor,actress,actually,add,...,worth,would,writer,writing,written,wrong,year,yes,yet,young
0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,...,0.000000,0.109105,0.0,0.0,0.000000,0.00000,0.0000,0.0,0.0,0.000000
1,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.107318,0.0,0.0,0.0,...,0.142212,0.000000,0.0,0.0,0.159024,0.00000,0.0000,0.0,0.0,0.000000
2,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.0,0.000000,0.00000,0.1162,0.0,0.0,0.143128
3,0.134987,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.0,0.000000,0.00000,0.0000,0.0,0.0,0.000000
4,0.000000,0.0,0.0,0.000000,0.074293,0.096532,0.000000,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.0,0.000000,0.00000,0.0000,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,0.122040,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.0,0.000000,0.00000,0.0000,0.0,0.0,0.000000
49996,0.000000,0.0,0.0,0.000000,0.152238,0.000000,0.000000,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.0,0.000000,0.00000,0.0000,0.0,0.0,0.000000
49997,0.000000,0.0,0.0,0.150025,0.097779,0.000000,0.000000,0.0,0.0,0.0,...,0.000000,0.079623,0.0,0.0,0.000000,0.14469,0.0000,0.0,0.0,0.000000
49998,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,...,0.000000,0.096380,0.0,0.0,0.000000,0.00000,0.0000,0.0,0.0,0.000000


BM-25

In [27]:
bm25l = BM25L(text_df['lemmatized_str'])
query = "actor"
doc_scores = bm25l.get_scores(query)



In [28]:
doc_scores

array([0.12196485, 0.06372755, 0.04655762, ..., 0.07558058, 0.10282859,
       0.04563071])

n-grams

In [30]:
tfidf_vectorizer = TfidfVectorizer(ngram_range=(2, 2), max_features=500)  
X_tfidf = tfidf_vectorizer.fit_transform(text_df['lemmatized_str'])


In [31]:
X_tfidf_df = pd.DataFrame(X_tfidf.toarray(), columns=tfidf_vectorizer.get_feature_names_out())
X_tfidf_df

Unnamed: 0,10 10,10 minute,10 year,15 minute,20 minute,20 year,30 minute,90 minute,absolutely nothing,academy award,...,would ve,would want,writer director,year ago,year later,year old,yet another,young girl,young man,young woman
0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
1,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
2,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.496209
3,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
4,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
49996,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
49997,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
49998,0.0,0.0,0.0,0.0,0.0,0.0,0.617854,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000


Positional n gram

In [33]:
def Posngram(text,n):
    words = text.split()  
    positional_ngrams = []
    
    for i in range(len(words) - n + 1):
        ngram = [(words[i + j], i + j + 1) for j in range(n)]  
        positional_ngrams.append(' '.join([f"{word}_{pos}" for word, pos in ngram]))
        
    return positional_ngrams


In [34]:
n = 2
text_df['positional_bigrams'] = text_df['lemmatized_str'].apply(lambda x: Posngram(x, n))


In [35]:
text_df['positional_bigrams']

0        [one_1 reviewer_2, reviewer_2 mentioned_3, men...
1        [wonderful_1 little_2, little_2 production_3, ...
2        [thought_1 wonderful_2, wonderful_2 way_3, way...
3        [basically_1 's_2, 's_2 family_3, family_3 lit...
4        [petter_1 mattei_2, mattei_2 's_3, 's_3 ``_4, ...
                               ...                        
49995    [thought_1 movie_2, movie_2 right_3, right_3 g...
49996    [bad_1 plot_2, plot_2 ,_3, ,_3 bad_4, bad_4 di...
49997    [catholic_1 taught_2, taught_2 parochial_3, pa...
49998    ['m_1 going_2, going_2 disagree_3, disagree_3 ...
49999    [one_1 expects_2, expects_2 star_3, star_3 tre...
Name: positional_bigrams, Length: 50000, dtype: object

In [36]:
n = 3
text_df['positional_trigram'] = text_df['lemmatized_str'].apply(lambda x: Posngram(x, n))


In [37]:

text_df['positional_trigram']

0        [one_1 reviewer_2 mentioned_3, reviewer_2 ment...
1        [wonderful_1 little_2 production_3, little_2 p...
2        [thought_1 wonderful_2 way_3, wonderful_2 way_...
3        [basically_1 's_2 family_3, 's_2 family_3 litt...
4        [petter_1 mattei_2 's_3, mattei_2 's_3 ``_4, '...
                               ...                        
49995    [thought_1 movie_2 right_3, movie_2 right_3 go...
49996    [bad_1 plot_2 ,_3, plot_2 ,_3 bad_4, ,_3 bad_4...
49997    [catholic_1 taught_2 parochial_3, taught_2 par...
49998    ['m_1 going_2 disagree_3, going_2 disagree_3 p...
49999    [one_1 expects_2 star_3, expects_2 star_3 trek...
Name: positional_trigram, Length: 50000, dtype: object

**skipgram**

In [39]:
def tokenizer(text):
    tokens = word_tokenize(text)
    return tokens
    




In [40]:
tokens = text_df['lemmatized_str'].apply(tokenizer).tolist()
skipgram_model = Word2Vec(sentences=tokens, vector_size=100, window=2, min_count=20, sg=1, workers=4, epochs=10)


def sentence_to_vector(tokens, model):
    vectors = [model.wv[word] for word in tokens if word in model.wv]
    return np.mean(vectors, axis=0) if vectors else np.zeros(model.vector_size)

text_df['tokens'] = text_df['lemmatized_str'].apply(tokenizer)
text_df['vector'] = text_df['tokens'].apply(lambda x: sentence_to_vector(x, skipgram_model))


In [41]:
text_df['vector']

0        [-0.090725444, 0.063403, 0.14552025, 0.0558352...
1        [0.021432318, 0.10225901, 0.10679602, 0.087396...
2        [0.0089241825, 0.061827864, 0.12811357, 0.0005...
3        [-0.008080406, 0.029984698, 0.13916501, 0.0614...
4        [-0.0021420056, 0.09271964, 0.12573533, 0.0462...
                               ...                        
49995    [0.056389377, 0.080036215, 0.130944, 0.0208751...
49996    [0.044070005, 0.11114829, 0.12752591, 0.088501...
49997    [-0.059719726, 0.06924403, 0.19710936, 0.08535...
49998    [-0.03079784, 0.06502045, 0.14937805, 0.127326...
49999    [0.0032684596, 0.1417239, 0.087981746, 0.04485...
Name: vector, Length: 50000, dtype: object

In [42]:
text_df['sentiment'].value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

In [43]:
label_encoder = LabelEncoder()
text_df['sentiment'] = label_encoder.fit_transform(text_df['sentiment'])
text_df.head()

Unnamed: 0,sentiment,lemmatized,lemmatized_str,positional_bigrams,positional_trigram,tokens,vector
0,1,"[one, reviewer, mentioned, watching, 1, oz, ep...",one reviewer mentioned watching 1 oz episode '...,"[one_1 reviewer_2, reviewer_2 mentioned_3, men...","[one_1 reviewer_2 mentioned_3, reviewer_2 ment...","[one, reviewer, mentioned, watching, 1, oz, ep...","[-0.090725444, 0.063403, 0.14552025, 0.0558352..."
1,1,"[wonderful, little, production, ., filming, te...",wonderful little production . filming techniqu...,"[wonderful_1 little_2, little_2 production_3, ...","[wonderful_1 little_2 production_3, little_2 p...","[wonderful, little, production, ., filming, te...","[0.021432318, 0.10225901, 0.10679602, 0.087396..."
2,1,"[thought, wonderful, way, spend, time, hot, su...",thought wonderful way spend time hot summer we...,"[thought_1 wonderful_2, wonderful_2 way_3, way...","[thought_1 wonderful_2 way_3, wonderful_2 way_...","[thought, wonderful, way, spend, time, hot, su...","[0.0089241825, 0.061827864, 0.12811357, 0.0005..."
3,0,"[basically, 's, family, little, boy, (, jake, ...",basically 's family little boy ( jake ) think ...,"[basically_1 's_2, 's_2 family_3, family_3 lit...","[basically_1 's_2 family_3, 's_2 family_3 litt...","[basically, 's, family, little, boy, (, jake, ...","[-0.008080406, 0.029984698, 0.13916501, 0.0614..."
4,1,"[petter, mattei, 's, ``, love, time, money, ''...",petter mattei 's `` love time money '' visuall...,"[petter_1 mattei_2, mattei_2 's_3, 's_3 ``_4, ...","[petter_1 mattei_2 's_3, mattei_2 's_3 ``_4, '...","[petter, mattei, 's, ``, love, time, money, ``...","[-0.0021420056, 0.09271964, 0.12573533, 0.0462..."


**1.Chi Square test**

**2.Mutual importence**

**3.Recursive feature elemination**

In [110]:
sentiment_tensor = torch.tensor(text_df['sentiment'].values)

In [104]:
tf_idf_tensor    = torch.tensor(X_tfidf_df.values)

In [106]:
tf_idf_tensor

tensor([[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.4962],
        ...,
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000]],
       dtype=torch.float64)

In [112]:
sentiment_tensor

tensor([1, 1, 1,  ..., 0, 0, 0])

In [122]:
print(sentiment_tensor.shape)
print(tf_idf_tensor.shape)


(50000,)
(50000, 500)


**Chi Square test of tf-idf**

In [126]:
from sklearn.feature_selection import chi2
import pandas as pd

chi2_stat, p_values = chi2(tf_idf_tensor , sentiment_tensor )

features = vectorizer.get_feature_names_out()

chi2_results = pd.DataFrame({'Feature': features, 'Chi2 Stat': chi2_stat, 'p-value': p_values})


chi2_results_sorted = chi2_results.sort_values(by='p-value')

print(chi2_results_sorted.head(10))

           Feature   Chi2 Stat       p-value
455  unfortunately  426.035570  1.185101e-94
302             oh  369.162717  2.847718e-82
476          white  340.110940  6.035453e-76
324        playing  281.760467  3.104302e-63
28         attempt  261.373354  8.613125e-59
291          never  231.546762  2.741728e-52
475           went  206.306027  8.786570e-47
0               10  203.689135  3.272019e-46
24        anything  198.772580  3.869781e-45
206        however  185.040429  3.845670e-42


**Chi-square test frequency**

In [143]:
frequency_tensor = torch.tensor(normalized_X_df.values)
frequency_tensor.shape

torch.Size([50000, 500])

In [145]:
from sklearn.feature_selection import chi2
import pandas as pd

chi2_stat, p_values = chi2(frequency_tensor , sentiment_tensor )

features = vectorizer.get_feature_names_out()

chi2_results = pd.DataFrame({'Feature': features, 'Chi2 Stat': chi2_stat, 'p-value': p_values})


chi2_results_sorted = chi2_results.sort_values(by='p-value')

print(chi2_results_sorted.head(10))

       Feature   Chi2 Stat       p-value
33         bad  130.004699  4.088088e-30
178      great   84.989980  2.998809e-20
489      worst   79.740956  4.268600e-19
469      waste   49.969100  1.561863e-12
31       awful   47.752236  4.836323e-12
132  excellent   41.950592  9.360902e-11
422   terrible   40.086961  2.429049e-10
258       love   36.630571  1.427750e-09
42        best   36.516262  1.513978e-09
50      boring   33.572628  6.865204e-09


**Mutual importence**

In [151]:
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

mutual_info = mutual_info_classif(frequency_tensor, sentiment_tensor)

features = vectorizer.get_feature_names_out()
mi_results = pd.DataFrame({'Feature': features, 'Mutual Information': mutual_info})

mi_results_sorted = mi_results.sort_values(by='Mutual Information', ascending=False)

print(mi_results_sorted.head(10))


       Feature  Mutual Information
33         bad            0.042295
489      worst            0.036885
31       awful            0.025878
469      waste            0.025158
178      great            0.024071
132  excellent            0.017291
50      boring            0.015808
422   terrible            0.014686
413     stupid            0.014193
415   supposed            0.013696


**model fitting**

In [165]:
X = frequency_tensor
y = sentiment_tensor
X_train, X_pred, y_train, y_pred = train_test_split(X, y, test_size = 0.3, random_state = 42)
X_val, X_test, y_val, y_test = train_test_split(X_pred, y_pred, test_size = 0.5, random_state = 42)


In [179]:
model = svm.SVC(kernel='poly')

In [181]:
model.fit(X_train,y_train)

In [197]:
y_pred=model.predict(X_val)
print('Model accuracy score with default hyperparameters: {0:0.4f}'. format(accuracy_score(y_val, y_pred)))

Model accuracy score with default hyperparameters: 0.8373


In [199]:
model_1 = svm.SVC(kernel='rbf')

In [201]:
model_1.fit(X_train,y_train)

In [202]:
y_pred_1=model_1.predict(X_val)
print('Model accuracy score with default hyperparameters: {0:0.4f}'. format(accuracy_score(y_val, y_pred_1)))

Model accuracy score with default hyperparameters: 0.8476


In [211]:
y_pred=model_1.predict(X_test)
print('Model accuracy score with default hyperparameters: {0:0.4f}'. format(accuracy_score(y_test, y_pred)))

Model accuracy score with default hyperparameters: 0.8483
