In [1]:
!pip install gensim




[notice] A new release of pip is available: 23.0.1 -> 23.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
import gensim
import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import csr_matrix
from sklearn.decomposition import TruncatedSVD

In [2]:
text = pd.read_csv("IMDB Dataset.csv")

In [3]:
text.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
text["review_clean"] = text["review"].apply(lambda x: gensim.utils.simple_preprocess(x))
text.head()

Unnamed: 0,review,sentiment,review_clean
0,One of the other reviewers has mentioned that ...,positive,"[one, of, the, other, reviewers, has, mentione..."
1,A wonderful little production. <br /><br />The...,positive,"[wonderful, little, production, br, br, the, f..."
2,I thought this was a wonderful way to spend ti...,positive,"[thought, this, was, wonderful, way, to, spend..."
3,Basically there's a family where a little boy ...,negative,"[basically, there, family, where, little, boy,..."
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,"[petter, mattei, love, in, the, time, of, mone..."


In [5]:
clean_text = text[["review_clean"]]
labels = text["sentiment"]

In [6]:
len(clean_text)

50000

In [7]:
corpus = []
for review in clean_text["review_clean"]:
  corpus.append(" ".join(review))

In [8]:
len(corpus)

50000

In [9]:
clean_text["corpus"] = corpus

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_text["corpus"] = corpus


In [10]:
len(clean_text)

50000

In [11]:
clean_text

Unnamed: 0,review_clean,corpus
0,"[one, of, the, other, reviewers, has, mentione...",one of the other reviewers has mentioned that ...
1,"[wonderful, little, production, br, br, the, f...",wonderful little production br br the filming ...
2,"[thought, this, was, wonderful, way, to, spend...",thought this was wonderful way to spend time o...
3,"[basically, there, family, where, little, boy,...",basically there family where little boy jake t...
4,"[petter, mattei, love, in, the, time, of, mone...",petter mattei love in the time of money is vis...
...,...,...
49995,"[thought, this, movie, did, down, right, good,...",thought this movie did down right good job it ...
49996,"[bad, plot, bad, dialogue, bad, acting, idioti...",bad plot bad dialogue bad acting idiotic direc...
49997,"[am, catholic, taught, in, parochial, elementa...",am catholic taught in parochial elementary sch...
49998,"[going, to, have, to, disagree, with, the, pre...",going to have to disagree with the previous co...


In [12]:
X_train, X_test, y_train, y_test = train_test_split(clean_text, 
                                                    labels, 
                                                    test_size=0.30, 
                                                    random_state=42)

In [13]:
tfidf_vect = TfidfVectorizer(max_features = 20000, ngram_range = (1,2) )
X_train_tfidf = tfidf_vect.fit_transform(X_train["corpus"])
X_test_tfidf = tfidf_vect.transform(X_test["corpus"])

In [14]:
svd = TruncatedSVD(n_components = 2000, n_iter=7, random_state=42)
X_train_reduced = svd.fit_transform(X_train_tfidf)
X_test_reduced = svd.transform(X_test_tfidf)

In [15]:
print(sum(svd.explained_variance_))

0.4523164557189412


In [16]:
X_train_tfidf_df = pd.DataFrame(X_train_reduced, index = X_train.index, columns = [i for i in range(501, 2501)])
X_test_tfidf_df = pd.DataFrame(X_test_reduced, index = X_test.index, columns = [i for i in range(501, 2501)])

In [17]:
w2v_model = gensim.models.Word2Vec(X_train["review_clean"],
                                   vector_size=500,
                                   window=5,
                                   min_count=2)

In [18]:
words = set(w2v_model.wv.index_to_key)
X_train_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words]) for ls in X_train["review_clean"]], dtype = "object")
X_test_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words]) for ls in X_test["review_clean"]], dtype = "object")

In [19]:
# Compute sentence vectors by averaging the word vectors for the words contained in the sentence
X_train_vect_avg = []
for v in X_train_vect:
    if v.size:
        X_train_vect_avg.append(v.mean(axis=0))
    else:
        X_train_vect_avg.append(np.zeros(100, dtype=float))
        
X_test_vect_avg = []
for v in X_test_vect:
    if v.size:
        X_test_vect_avg.append(v.mean(axis=0))
    else:
        X_test_vect_avg.append(np.zeros(100, dtype=float))

In [20]:
X_train_df = pd.DataFrame(X_train_vect_avg, index = X_train.index)
X_test_df = pd.DataFrame(X_test_vect_avg, index = X_test.index)

In [21]:
X_train = pd.concat([X_train_df, X_train_tfidf_df], axis = 1)
X_test = pd.concat([X_test_df, X_test_tfidf_df], axis = 1)

In [22]:
X_train.to_csv("X_train.csv",index = False)
X_test.to_csv("X_test.csv",index = False)
y_train.to_csv("y_train.csv", index = False)
y_test.to_csv("y_test.csv",index = False)