In [1]:
#First we load our data and check them out
import pandas as pd

df_train_pos = pd.read_csv("Copy of train_pos.csv", encoding = "UTF-8")
df_train_neg = pd.read_csv("Copy of train_neg.csv", encoding = "UTF-8")
df_test_pos = pd.read_csv("Copy of test_pos.csv", encoding = "UTF-8")
df_test_neg = pd.read_csv("Copy of test_neg.csv", encoding = "UTF-8")
print(df_train_pos)
print(df_train_neg)
print(df_test_pos)
print(df_test_neg)

       Unnamed: 0     ID                                               text  \
0               0      0  Bromwell High is a cartoon comedy. It ran at t...   
1               1  10000  Homelessness (or Houselessness as George Carli...   
2               2  10001  Brilliant over-acting by Lesley Ann Warren. Be...   
3               3  10002  This is easily the most underrated film inn th...   
4               4  10003  This is not the typical Mel Brooks film. It wa...   
...           ...    ...                                                ...   
12495       12495   9998  Seeing as the vote average was pretty low, and...   
12496       12496   9999  The plot had some wretched, unbelievable twist...   
12497       12497    999  I am amazed at how this movie(and most others ...   
12498       12498     99  A Christmas Together actually came before my t...   
12499       12499      9  Working-class romantic drama from director Mar...   

       rating  
0           9  
1           8  
2  

In [2]:
#Now we remove extra columns and just keep their text

df_train_pos = df_train_pos.drop('rating', axis=1)
df_train_pos = df_train_pos.drop('ID', axis=1)
df_train_pos = df_train_pos.drop('Unnamed: 0', axis=1)

df_train_neg = df_train_neg.drop('rating', axis=1)
df_train_neg = df_train_neg.drop('ID', axis=1)
df_train_neg = df_train_neg.drop('Unnamed: 0', axis=1)

df_test_pos = df_test_pos.drop('rating', axis=1)
df_test_pos = df_test_pos.drop('ID', axis=1)
df_test_pos = df_test_pos.drop('Unnamed: 0', axis=1)

df_test_neg = df_test_neg.drop('rating', axis=1)
df_test_neg = df_test_neg.drop('ID', axis=1)
df_test_neg = df_test_neg.drop('Unnamed: 0', axis=1)

print(df_train_pos)
print(df_train_neg)
print(df_test_pos)
print(df_test_neg)

                                                    text
0      Bromwell High is a cartoon comedy. It ran at t...
1      Homelessness (or Houselessness as George Carli...
2      Brilliant over-acting by Lesley Ann Warren. Be...
3      This is easily the most underrated film inn th...
4      This is not the typical Mel Brooks film. It wa...
...                                                  ...
12495  Seeing as the vote average was pretty low, and...
12496  The plot had some wretched, unbelievable twist...
12497  I am amazed at how this movie(and most others ...
12498  A Christmas Together actually came before my t...
12499  Working-class romantic drama from director Mar...

[12500 rows x 1 columns]
                                                    text
0      Story of a man who has unnatural feelings for ...
1      Airport '77 starts as a brand new luxury 747 p...
2      This film lacked something I couldn't put my f...
3      Sorry everyone,,, I know this is supposed to b...
4    

In [3]:
# Now we merge our train data and our test data together and also add
# a sentiment column as our label

df_train_pos['sentiment'] = 1
df_train_neg['sentiment'] = -1
df_test_pos['sentiment'] = 1
df_test_neg['sentiment'] = -1

df_train = pd.concat([df_train_pos, df_train_neg], ignore_index=True)
df_test = pd.concat([df_test_pos, df_test_neg], ignore_index=True)

print(df_train)
print(df_test)

import gc

del df_train_pos
del df_train_neg
del df_test_pos
del df_test_neg

gc.collect()

                                                    text  sentiment
0      Bromwell High is a cartoon comedy. It ran at t...          1
1      Homelessness (or Houselessness as George Carli...          1
2      Brilliant over-acting by Lesley Ann Warren. Be...          1
3      This is easily the most underrated film inn th...          1
4      This is not the typical Mel Brooks film. It wa...          1
...                                                  ...        ...
24995  Towards the end of the movie, I felt it was to...         -1
24996  This is the kind of movie that my enemies cont...         -1
24997  I saw 'Descent' last night at the Stockholm Fi...         -1
24998  Some films that you pick up for a pound turn o...         -1
24999  This is one of the dumbest films, I've ever se...         -1

[25000 rows x 2 columns]
                                                    text  sentiment
0      I went and saw this movie last night after bei...          1
1      Actor turned di

16

In [4]:
#Now we develop a tokenizer function so we use it both on train and test documents

import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
nltk.download('punkt')
porter_stemmer = PorterStemmer()

def tokenizer(document):
    # First we lower all our words
    document = document.lower()

    # Now we replace shortened forms back to their original form
    shortened_words_dic = {
        '\n' : ' ',
        'i.e.': 'that is',
        'e.g.': 'for example',
        'won’t': ' will not',
        'can’t': ' can not',
        '’ve': ' have',
        '’m': ' am',
        '’d': ' would',
        '’ll': ' will',
        '’s': ' is',
        '’re': ' are',
        'n’t': ' not'
    }
    for key, value in shortened_words_dic.items():
        document = document.replace(key, value)

    # Now we tokenize it with nltk word tokenizer
    tokenized_document = word_tokenize(document)

    # Now we remove all punctuation marks or tokens made of only punctuation marks from our tokens
    bad_tokens = [',', '?', '”', ')', '’', ';', '“', '.', '$', '(', ':', '!', '/', '-', '[', ']', '{', '}', "'"]
    tokenized_document = [
        token for token in tokenized_document if token not in bad_tokens]

    new_tokenized_document = []
    for token in tokenized_document:
        flag = False
        for char in token:
            if(char not in bad_tokens):
                flag = True
                break
        if flag:
            new_tokenized_document.append(token)
    tokenized_document = new_tokenized_document

    # Now we remove punctuation marks that are left in the middle or at the end of tokens
    tokenized_document = [token.rstrip('.') for token in tokenized_document]
    tokenized_document = [token.rstrip('?') for token in tokenized_document]
    tokenized_document = [token.rstrip('—') for token in tokenized_document]
    tokenized_document = [token.rstrip('-') for token in tokenized_document]
    tokenized_document = [token.rstrip('–') for token in tokenized_document]
    tokenized_document = [token.rstrip('/') for token in tokenized_document]

    new_tokenized_document = []
    for token in tokenized_document:
        if '.' in token:
            cur = token.split('.')
            for i in range(len(cur)):
                if(i + 1 == len(cur)):
                    new_tokenized_document.append(cur[i])
                else:
                    if(len(cur[i]) == 0):
                        continue
                    if(cur[i][0] <= '9' and len(cur[i + 1]) > 0 and cur[i + 1][0] <= '9'):
                        cur[i + 1] = cur[i] + cur[i + 1]
                        continue
                    new_tokenized_document.append(cur[i])
        elif '?' in token:
            new_tokenized_document.extend(token.split('?'))
        elif '-' in token:
            new_tokenized_document.extend(token.split('-'))
        elif '—' in token:
            new_tokenized_document.extend(token.split('—'))
        elif '–' in token:
            new_tokenized_document.extend(token.split('–'))
        elif "'" in token:
            new_tokenized_document.extend(token.split("'"))
        elif "`" in token:
            new_tokenized_document.extend(token.split("`"))
        elif "/" in token:
            new_tokenized_document.extend(token.split("/"))
        elif "<" in token:
            new_tokenized_document.extend(token.split("<"))
        elif ">" in token:
            new_tokenized_document.extend(token.split(">"))
        else:
            new_tokenized_document.append(token)
    tokenized_document = new_tokenized_document

    #Now we remove empty tokens generated from filtering punctuation marks
    tokenized_document = [
        token for token in tokenized_document if len(token) != 0]

    # Now we remove stop words from it
    stop_words = ['he', 'for', 'in', 'is',
                  'was', 'of', 'and', 'to', 'a', 'the']
    tokenized_document = [
        token for token in tokenized_document if token not in stop_words]

    # Now its time to convert our tokens to their stemmed forms
    stemmed_tokenized_document = [porter_stemmer.stem(
        token) for token in tokenized_document]

    # Now that we are done with our tokenizing and preprocessing the tokens, it's time to return them
    return stemmed_tokenized_document

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Ashkan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [5]:
#Now we tokenize all our train data and find out how many term do we have
all_tokens = []
for i in range(df_train.shape[0]):
  all_tokens.extend(tokenizer(df_train.iloc[i]['text']))

terms = list(set(all_tokens))
print(len(terms))

52885


In [6]:
#Now because our tf_idf matrice will get very sparce
#we count each terms frequency and filter very sparce terms
tf = {}
for token in all_tokens:
  if(token in tf.keys()):
    tf[token] += 1
  else:
    tf[token] = 1

tf_term = [(tf[term], term) for term in terms]
tf_term.sort()

In [7]:
print(tf_term[-2000])

(248, 'section')


In [8]:
#As we have seen first 54000 terms are very sparce and they won't help our classification
#so we filter them out
terms = [tf_term[-(i + 1)][1] for i in range(2000)]
print(terms)

del tf_term
gc.collect()

['it', 'br', 'i', 'thi', 'that', 's', 'movi', 'film', 'as', 'with', 'but', 't', 'you', 'on', 'be', 'n', 'not', 'have', 'are', 'hi', 'one', 'all', 'at', 'they', 'like', 'by', 'an', 'who', 'so', 'from', 'do', 'there', 'her', 'or', 'just', 'about', 'ha', 'out', 'if', 'what', 'time', 'some', 'good', 'make', 'more', 'she', 'when', 'charact', 'get', 'see', 'veri', 'watch', 'up', 'would', 'stori', 'even', 'no', 'my', 'can', 'which', 'onli', 'realli', 'had', 'their', 'were', 'well', 'we', 'me', 'other', 'scene', 'did', 'doe', 'look', 'than', 'show', 'much', 'end', 'will', 'could', 'peopl', 'bad', 'go', 'been', 'great', 'also', 'into', 'first', 'becaus', 'love', 'think', 'how', 'him', 'way', 'act', 'most', 'play', 'made', 'thing', 'then', 'them', 'too', 'ani', 'after', 'know', 'say', 'seem', 'work', '*', 'plot', 'two', 'actor', 'year', 'come', 'mani', 'seen', 'take', 'life', 'want', 'never', 'littl', 'best', 'where', 'over', 'tri', 'off', 'man', 'ever', 'here', 'give', 'better', 'your', 'still'

32

In [9]:
# Now before implementing tf-idfs we first need
# a hash function to get termID from each term
all_tokens_hash = len(terms) * ['#']

def hash_func(token):
    B = 256
    M = len(terms)
    cur = 0
    for i in range(len(token)):
        cur = ((cur * B) + ord(token[i])) % M
    while (all_tokens_hash[cur] != token and all_tokens_hash[cur] != '#'):
        cur = (cur + 1) % M
    if (all_tokens_hash[cur] == '#'):
        all_tokens_hash[cur] = token
    return cur

In [10]:
#Now it's time to build the tf-idf matrice for our train and test data

#first we build the tf matrice
tf_idf_mat_train = [[0 for i in range(len(terms))] for j in range(df_train.shape[0])]
tf_idf_mat_test = [[0 for i in range(len(terms))] for j in range(df_test.shape[0])]

for i in range(df_train.shape[0]):
  tokenized_document = tokenizer(df_train.iloc[i]['text'])
  for token in tokenized_document:
    if token not in terms:
      continue
    tf_idf_mat_train[i][hash_func(token)] += 1

for i in range(df_test.shape[0]):
  tokenized_document = tokenizer(df_test.iloc[i]['text'])
  for token in tokenized_document:
    if token not in terms:
      continue
    tf_idf_mat_test[i][hash_func(token)] += 1    

In [11]:
#second we calculate idf for each term and multipy each tf matrice row by its value
#and at last we save it as a dataframe

N = df_train.shape[0]
idf_mat = [N / sum([(tf_idf_mat_train[j][i] > 0) for j in range(df_train.shape[0])])
 for i in range(len(terms))]

for i in range(df_train.shape[0]):
  for j in range(len(terms)):
    tf_idf_mat_train[i][j] *= idf_mat[j]
    
for i in range(df_test.shape[0]):
  for j in range(len(terms)):
    tf_idf_mat_test[i][j] *= idf_mat[j]

df_tf_idf_train = pd.DataFrame(tf_idf_mat_train)
df_tf_idf_train.to_csv('df_tf_idf_train.csv', index=False)

df_tf_idf_test = pd.DataFrame(tf_idf_mat_test)
df_tf_idf_test.to_csv('df_tf_idf_test.csv', index=False)

print(df_tf_idf_train)

del tf_idf_mat_train
del tf_idf_mat_test
del idf_mat
gc.collect()

       0     1     2     3     4         5     6     7     8     9     ...  \
0       0.0   0.0   0.0   0.0   0.0  6.801796   0.0   0.0   0.0   0.0  ...   
1       0.0   0.0   0.0   0.0   0.0  0.000000   0.0   0.0   0.0   0.0  ...   
2       0.0   0.0   0.0   0.0   0.0  0.000000   0.0   0.0   0.0   0.0  ...   
3       0.0   0.0   0.0   0.0   0.0  0.000000   0.0   0.0   0.0   0.0  ...   
4       0.0   0.0   0.0   0.0   0.0  0.000000   0.0   0.0   0.0   0.0  ...   
...     ...   ...   ...   ...   ...       ...   ...   ...   ...   ...  ...   
24995   0.0   0.0   0.0   0.0   0.0  0.000000   0.0   0.0   0.0   0.0  ...   
24996   0.0   0.0   0.0   0.0   0.0  0.000000   0.0   0.0   0.0   0.0  ...   
24997   0.0   0.0   0.0   0.0   0.0  3.400898   0.0   0.0   0.0   0.0  ...   
24998   0.0   0.0   0.0   0.0   0.0  3.400898   0.0   0.0   0.0   0.0  ...   
24999   0.0   0.0   0.0   0.0   0.0  0.000000   0.0   0.0   0.0   0.0  ...   

             1990       1991  1992  1993  1994  1995  1996  199

0

In [12]:
#Now its time to do our naiive bayes algorithm
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

y_train = df_train['sentiment']
y_test = df_test['sentiment']

# Creating a Gaussian Naive Bayes classifier
NB = GaussianNB()

NB.fit(df_tf_idf_train, y_train)

y_pred_train = NB.predict(df_tf_idf_train)
y_pred_test = NB.predict(df_tf_idf_test)


accuracy_train = accuracy_score(y_train, y_pred_train)
precision_train = precision_score(y_train, y_pred_train)
recall_train = recall_score(y_train, y_pred_train)
f1_score_train = f1_score(y_train, y_pred_train)

print(f"Accuracy train: {accuracy_train:.2f}")
print(f"Precision train: {precision_train:.2f}")
print(f"Recall train: {recall_train:.2f}")
print(f"F1-score train: {f1_score_train:.2f}")

accuracy_test = accuracy_score(y_test, y_pred_test)
precision_test = precision_score(y_test, y_pred_test)
recall_test = recall_score(y_test, y_pred_test)
f1_score_test = f1_score(y_test, y_pred_test)

print(f"Accuracy test: {accuracy_test:.2f}")
print(f"Precision test: {precision_test:.2f}")
print(f"Recall test: {recall_test:.2f}")
print(f"F1-score test: {f1_score_test:.2f}")

Accuracy train: 0.76
Precision train: 0.84
Recall train: 0.64
F1-score train: 0.73
Accuracy test: 0.72
Precision test: 0.81
Recall test: 0.58
F1-score test: 0.68


In [13]:
del df_tf_idf_train
del df_tf_idf_test
gc.collect()

16

In [14]:
#First we learn our W2V model
from gensim.models import Word2Vec

tokenized_documents = [tokenizer(df_train.iloc[i]['text']) for i in range(df_train.shape[0])]
model = Word2Vec(sentences=tokenized_documents, vector_size=100, window=5, sg=0, min_count=1)
model.train(tokenized_documents, total_examples=len(tokenized_documents), epochs=10)

(40693357, 48275570)

In [15]:
#Now we get vectors of all of our terms and set their mean to be our representive vector
import numpy as np

vw_embeded_train = np.empty(shape=(0, 100))

for i in range(df_train.shape[0]):
    tokenized_document = tokenizer(df_train.iloc[i]['text'])
    cur = np.zeros(100)
    cnt = 0
    for token in tokenized_document:
        if token not in terms:
            continue
        
        cnt += 1
        cur += model.wv[token]
        
    if(cnt != 0):
        cur /= cnt
    
    vw_embeded_train = np.concatenate((vw_embeded_train, [cur]))
    
vw_embeded_test = np.empty(shape=(0, 100))

for i in range(df_test.shape[0]):
    tokenized_document = tokenizer(df_test.iloc[i]['text'])
    cur = np.zeros(100)
    cnt = 0
    for token in tokenized_document:
        if token not in terms:
            continue
        
        cnt += 1
        cur += model.wv[token]
        
    if(cnt != 0):
        cur /= cnt
    
    vw_embeded_test = np.concatenate((vw_embeded_test, [cur]))

print(vw_embeded_train.shape)
print(vw_embeded_test.shape)

(25000, 100)
(25000, 100)


In [16]:
#Now its time to train our svm model on our W2V data
from sklearn.svm import SVC

svm = SVC(kernel='linear', C=1.0)

svm.fit(vw_embeded_train, y_train)

y_pred_train = svm.predict(vw_embeded_train)
y_pred_test = svm.predict(vw_embeded_test)

accuracy_train = accuracy_score(y_train, y_pred_train)
precision_train = precision_score(y_train, y_pred_train)
recall_train = recall_score(y_train, y_pred_train)
f1_score_train = f1_score(y_train, y_pred_train)

print(f"Accuracy train: {accuracy_train:.2f}")
print(f"Precision train: {precision_train:.2f}")
print(f"Recall train: {recall_train:.2f}")
print(f"F1-score train: {f1_score_train:.2f}")

accuracy_test = accuracy_score(y_test, y_pred_test)
precision_test = precision_score(y_test, y_pred_test)
recall_test = recall_score(y_test, y_pred_test)
f1_score_test = f1_score(y_test, y_pred_test)

print(f"Accuracy test: {accuracy_test:.2f}")
print(f"Precision test: {precision_test:.2f}")
print(f"Recall test: {recall_test:.2f}")
print(f"F1-score test: {f1_score_test:.2f}")

Accuracy train: 0.83
Precision train: 0.83
Recall train: 0.84
F1-score train: 0.83
Accuracy test: 0.83
Precision test: 0.83
Recall test: 0.82
F1-score test: 0.83


In [17]:
#Now we use LSA for our second embedding
from sklearn.decomposition import TruncatedSVD

lsa = TruncatedSVD(n_components=30)

vw_train_lsa = lsa.fit_transform(vw_embeded_train)
vw_test_lsa = lsa.transform(vw_embeded_test)

del vw_embeded_train
del vw_embeded_test
gc.collect()

0

In [18]:
#Now its time to train our svm model
from sklearn.svm import SVC

svm = SVC(kernel='linear', C=1.0)

svm.fit(vw_train_lsa, y_train)

y_pred_train = svm.predict(vw_train_lsa)
y_pred_test = svm.predict(vw_test_lsa)

accuracy_train = accuracy_score(y_train, y_pred_train)
precision_train = precision_score(y_train, y_pred_train)
recall_train = recall_score(y_train, y_pred_train)
f1_score_train = f1_score(y_train, y_pred_train)

print(f"Accuracy train: {accuracy_train:.2f}")
print(f"Precision train: {precision_train:.2f}")
print(f"Recall train: {recall_train:.2f}")
print(f"F1-score train: {f1_score_train:.2f}")

accuracy_test = accuracy_score(y_test, y_pred_test)
precision_test = precision_score(y_test, y_pred_test)
recall_test = recall_score(y_test, y_pred_test)
f1_score_test = f1_score(y_test, y_pred_test)

print(f"Accuracy test: {accuracy_test:.2f}")
print(f"Precision test: {precision_test:.2f}")
print(f"Recall test: {recall_test:.2f}")
print(f"F1-score test: {f1_score_test:.2f}")

Accuracy train: 0.81
Precision train: 0.81
Recall train: 0.82
F1-score train: 0.81
Accuracy test: 0.81
Precision test: 0.82
Recall test: 0.81
F1-score test: 0.81


In [26]:
#First we load our GloVe model
import gensim.downloader as api

glove_model = api.load("glove-wiki-gigaword-300")



In [40]:
#And we get vectors of all of our terms and set their mean to be our representive vector
GV_embeded_train = np.empty(shape=(0, 300))

for i in range(df_train.shape[0]):
    tokenized_document = tokenizer(df_train.iloc[i]['text'])
    cur = np.zeros(300)
    cnt = 0
    for token in tokenized_document:
        if token not in glove_model.key_to_index:
            continue
        
        cnt += 1
        cur += glove_model[token]
        
    if(cnt != 0):
        cur /= cnt
    
    GV_embeded_train = np.concatenate((GV_embeded_train, [cur]))
    
GV_embeded_test = np.empty(shape=(0, 300))

for i in range(df_test.shape[0]):
    tokenized_document = tokenizer(df_test.iloc[i]['text'])
    cur = np.zeros(300)
    cnt = 0
    for token in tokenized_document:
        if token not in glove_model.key_to_index:
            continue
        
        cnt += 1
        cur += glove_model[token]
        
    if(cnt != 0):
        cur /= cnt
    
    GV_embeded_test = np.concatenate((GV_embeded_test, [cur]))

print(GV_embeded_train.shape)
print(GV_embeded_test.shape)

(25000, 300)
(25000, 300)


In [41]:
#Now its time to train our svm model on our W2V data
from sklearn.svm import SVC

svm = SVC(kernel='linear', C=1.0)

svm.fit(GV_embeded_train, y_train)

y_pred_train = svm.predict(GV_embeded_train)
y_pred_test = svm.predict(GV_embeded_test)

accuracy_train = accuracy_score(y_train, y_pred_train)
precision_train = precision_score(y_train, y_pred_train)
recall_train = recall_score(y_train, y_pred_train)
f1_score_train = f1_score(y_train, y_pred_train)

print(f"Accuracy train: {accuracy_train:.2f}")
print(f"Precision train: {precision_train:.2f}")
print(f"Recall train: {recall_train:.2f}")
print(f"F1-score train: {f1_score_train:.2f}")

accuracy_test = accuracy_score(y_test, y_pred_test)
precision_test = precision_score(y_test, y_pred_test)
recall_test = recall_score(y_test, y_pred_test)
f1_score_test = f1_score(y_test, y_pred_test)

print(f"Accuracy test: {accuracy_test:.2f}")
print(f"Precision test: {precision_test:.2f}")
print(f"Recall test: {recall_test:.2f}")
print(f"F1-score test: {f1_score_test:.2f}")

Accuracy train: 0.81
Precision train: 0.80
Recall train: 0.81
F1-score train: 0.81
Accuracy test: 0.80
Precision test: 0.80
Recall test: 0.80
F1-score test: 0.80


In [43]:
del vw_train_lsa
del vw_test_lsa
del GV_embeded_train
del GV_embeded_test
del glove_model
gc.collect()

2836

In [53]:
#First we train our FastText model
from gensim.models import FastText

tokenized_documents = [tokenizer(df_train.iloc[i]['text']) for i in range(df_train.shape[0])]
fasttext_model = FastText(sentences=tokenized_documents, vector_size=100, window=5, min_count=1, workers=4)

In [54]:
#And we get vectors of all of our terms and set their mean to be our representive vector
FT_embeded_train = np.empty(shape=(0, 100))

for i in range(df_train.shape[0]):
    tokenized_document = tokenizer(df_train.iloc[i]['text'])
    cur = np.zeros(100)
    cnt = 0
    for token in tokenized_document:
        if token not in terms:
            continue
        
        cnt += 1
        cur += fasttext_model.wv[token]
        
    if(cnt != 0):
        cur /= cnt
    
    FT_embeded_train = np.concatenate((FT_embeded_train, [cur]))
    
FT_embeded_test = np.empty(shape=(0, 100))

for i in range(df_test.shape[0]):
    tokenized_document = tokenizer(df_test.iloc[i]['text'])
    cur = np.zeros(100)
    cnt = 0
    for token in tokenized_document:
        if token not in terms:
            continue
        
        cnt += 1
        cur += fasttext_model.wv[token]
        
    if(cnt != 0):
        cur /= cnt
    
    FT_embeded_test = np.concatenate((FT_embeded_test, [cur]))

print(FT_embeded_train.shape)
print(FT_embeded_test.shape)

(25000, 100)
(25000, 100)


In [55]:
#Now its time to train our svm model on our W2V data
from sklearn.svm import SVC

svm = SVC(kernel='linear', C=1.0)

svm.fit(FT_embeded_train, y_train)

y_pred_train = svm.predict(FT_embeded_train)
y_pred_test = svm.predict(FT_embeded_test)

accuracy_train = accuracy_score(y_train, y_pred_train)
precision_train = precision_score(y_train, y_pred_train)
recall_train = recall_score(y_train, y_pred_train)
f1_score_train = f1_score(y_train, y_pred_train)

print(f"Accuracy train: {accuracy_train:.2f}")
print(f"Precision train: {precision_train:.2f}")
print(f"Recall train: {recall_train:.2f}")
print(f"F1-score train: {f1_score_train:.2f}")

accuracy_test = accuracy_score(y_test, y_pred_test)
precision_test = precision_score(y_test, y_pred_test)
recall_test = recall_score(y_test, y_pred_test)
f1_score_test = f1_score(y_test, y_pred_test)

print(f"Accuracy test: {accuracy_test:.2f}")
print(f"Precision test: {precision_test:.2f}")
print(f"Recall test: {recall_test:.2f}")
print(f"F1-score test: {f1_score_test:.2f}")

Accuracy train: 0.81
Precision train: 0.80
Recall train: 0.81
F1-score train: 0.81
Accuracy test: 0.80
Precision test: 0.81
Recall test: 0.80
F1-score test: 0.80
