# Text Classification using
## 1. Bag of words
## 2. TfIdf

### Reading the data and taking a sample out of it because training it will take time

In [1]:
import pandas as pd

temp_df = pd.read_csv(r"C:\Users\rezaa\Desktop\IMDB Dataset.csv")
df = temp_df.sample(10000)
df.head()

Unnamed: 0,review,sentiment
36236,Largely dense road movie with some comic relie...,negative
10905,I just saw this at the 2006 Vancouver internat...,negative
49543,the acting is good.thats the positives out of ...,negative
4673,"Watched this film at a local festival, the Sil...",positive
34789,Universal studios. The name conjures up so man...,positive


### Counting the negative and positive reviews so that the data is not unbalanced

In [2]:
df.sentiment.value_counts()

sentiment
negative    5049
positive    4951
Name: count, dtype: int64

### Checking if there is any null value

In [3]:
df.isnull().sum()

review       0
sentiment    0
dtype: int64

### Removing duplicate values

In [4]:
df.drop_duplicates(inplace=True)

### Removing html tags using Beautifulsoup as it is much easier

In [5]:
#removing html tags

from bs4 import BeautifulSoup
df['review'] = df.review.apply(lambda x : BeautifulSoup(x, "lxml").text)


  df['review'] = df.review.apply(lambda x : BeautifulSoup(x, "lxml").text)


In [6]:
df.head()

Unnamed: 0,review,sentiment
36236,Largely dense road movie with some comic relie...,negative
10905,I just saw this at the 2006 Vancouver internat...,negative
49543,the acting is good.thats the positives out of ...,negative
4673,"Watched this film at a local festival, the Sil...",positive
34789,Universal studios. The name conjures up so man...,positive


### Changing it into lower case

In [7]:
df['review'] = df.review.apply(lambda x : x.lower())
df.head()

Unnamed: 0,review,sentiment
36236,largely dense road movie with some comic relie...,negative
10905,i just saw this at the 2006 vancouver internat...,negative
49543,the acting is good.thats the positives out of ...,negative
4673,"watched this film at a local festival, the sil...",positive
34789,universal studios. the name conjures up so man...,positive


### Spliting the data into input array 'X' and output array 'y'

In [8]:
X = df.review
y = df.sentiment

### Using simple_preprocess from gensim library as it is very easy to implement

In [9]:
from gensim.utils import simple_preprocess
X = X.apply(lambda x: simple_preprocess(x))
X.head()

36236    [largely, dense, road, movie, with, some, comi...
10905    [just, saw, this, at, the, vancouver, internat...
49543    [the, acting, is, good, thats, the, positives,...
4673     [watched, this, film, at, local, festival, the...
34789    [universal, studios, the, name, conjures, up, ...
Name: review, dtype: object

### As simple_preprocess uses word tokenize and Count Vectorize uses sent tokenize so joined the word tokenize

In [10]:
X = X.apply(lambda x: ' '.join(x))
X.head()

36236    largely dense road movie with some comic relie...
10905    just saw this at the vancouver international f...
49543    the acting is good thats the positives out of ...
4673     watched this film at local festival the silver...
34789    universal studios the name conjures up so many...
Name: review, dtype: object

### Using label encoder on output

In [11]:
y

36236    negative
10905    negative
49543    negative
4673     positive
34789    positive
           ...   
48239    positive
23418    positive
22773    positive
15390    positive
23271    positive
Name: sentiment, Length: 9979, dtype: object

In [27]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)
y = y.reshape(-1,1)
y

array([[0],
       [0],
       [0],
       ...,
       [1],
       [1],
       [1]], dtype=int64)

### Using train test split

In [42]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=1)

## 1. Using count_vectorise for bag_of_words

In [43]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
X_train_cv = cv.fit_transform(X_train).toarray()
X_test_cv = cv.transform(X_test).toarray()
X_train_cv

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

### Using Naive Bayes for training data

In [44]:
from sklearn.naive_bayes import BernoulliNB
nb = BernoulliNB()
nb.fit(X_train_cv, y_train)
y_pred_nb = nb.predict(X_test_cv)

  y = column_or_1d(y, warn=True)


In [45]:
from sklearn.metrics import accuracy_score, confusion_matrix
#on train data
ac = accuracy_score(y_test, y_pred_nb)
cm = confusion_matrix(y_test, y_pred_nb)

In [46]:
print(ac)

0.8491983967935872


In [47]:
print(cm)

[[880 114]
 [187 815]]


### Using RandomForest

In [48]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
rf.fit(X_train_cv, y_train)
y_pred_rf = rf.predict(X_test_cv)

  return fit_method(estimator, *args, **kwargs)


In [49]:
#on test data
ac = accuracy_score(y_test, y_pred_rf)
cm = confusion_matrix(y_test, y_pred_rf)
print(ac)
print(cm)

0.8361723446893787
[[861 133]
 [194 808]]


## 2. Using TfIdf to vectorise

In [63]:
from sklearn.feature_extraction.text import TfidfVectorizer
tf = TfidfVectorizer()
X_train_tf = tf.fit_transform(X_train).toarray()
X_test_tf = tf.transform(X_test).toarray()
X_train_tf

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

### Using RandomForest

In [74]:
rf2 = RandomForestClassifier()
rf2.fit(X_train_tf, y_train)
y_pred_rf2 = rf2.predict(X_test_tf)

  return fit_method(estimator, *args, **kwargs)


In [75]:
#on test data
ac_rf = accuracy_score(y_test, y_pred_rf2)
cm_rf = confusion_matrix(y_test, y_pred_rf2)
print(ac_rf)
print(cm_rf)

0.8306613226452906
[[867 127]
 [211 791]]


## 3. Using word2vec then average word2vec to vectorise each document

### Since html tags are removed and texts are lower cased now the stopwords are left to be removed

In [102]:
from gensim.parsing.preprocessing import remove_stopwords
from nltk.tokenize import word_tokenize
X_new = X.apply(lambda x: remove_stopwords(x))
X_new = X_new.apply(lambda x: word_tokenize(x))
X_new.head()

36236    [largely, dense, road, movie, comic, relief, p...
10905    [saw, vancouver, international, film, festival...
49543    [acting, good, thats, positives, way, sosn, sh...
4673     [watched, film, local, festival, silver, sproc...
34789    [universal, studios, conjures, memories, horro...
Name: review, dtype: object

### Training a model to vectorise each word using Word2vec technique

In [105]:
from gensim.models import Word2Vec
model = Word2Vec(sentences=X_new, vector_size=100, window=5, min_count=1, workers=4)
model.build_vocab(X_new)
model.train(X_new, total_examples=model.corpus_count, epochs=model.epochs)

(4916542, 5100210)

### Converting each element in row as average of all the vectors of the words

In [122]:
sent_vec = X_new.apply(lambda x : np.mean([model.wv[word] for word in x], axis=0))
sent_vec

36236    [-0.065383434, 0.7738144, 0.42077214, 0.146440...
10905    [-0.13664493, 0.3707316, 0.032249875, 0.254868...
49543    [-0.33387658, 0.34524018, 0.07007837, -0.03389...
4673     [-0.48977193, 0.70593643, 0.41908643, 0.408738...
34789    [-0.18688276, 0.56751674, 0.23258784, 0.220401...
                               ...                        
48239    [-0.31897798, 0.65793484, 0.40024328, 0.359653...
23418    [-0.088541485, 0.38024762, 0.22750792, 0.04591...
22773    [-0.2347524, 0.36040795, 0.1202022, 0.16945086...
15390    [-0.11057853, 0.44843295, 0.052475844, 0.04737...
23271    [-0.22245479, 0.43733826, 0.15206936, 0.049067...
Name: review, Length: 9979, dtype: object

### Converting the above series into a 2d numpy array as it was getting converted into array(array([....] ,array[...], ...) instead of array([.....],[.....],....)

In [146]:
X_vec = np.stack(sent_vec.tolist())
X_vec

array([[-0.06538343,  0.7738144 ,  0.42077214, ..., -0.62768596,
        -0.31341848, -0.16729462],
       [-0.13664493,  0.3707316 ,  0.03224988, ..., -0.84445596,
        -0.2669913 , -0.11907455],
       [-0.33387658,  0.34524018,  0.07007837, ..., -0.7996561 ,
         0.04867775, -0.08257327],
       ...,
       [-0.2347524 ,  0.36040795,  0.1202022 , ..., -0.5337785 ,
         0.08287903, -0.04043095],
       [-0.11057853,  0.44843295,  0.05247584, ..., -0.39425895,
         0.05725573,  0.12360921],
       [-0.22245479,  0.43733826,  0.15206936, ..., -0.55844855,
         0.14156538,  0.03606821]], dtype=float32)

### Training in a random forest

In [147]:
X_train_vec, X_test_vec, y_train_vec, y_test_vec = train_test_split(X_vec, y, test_size=0.20, random_state=1)
rf3 = RandomForestClassifier()
rf3.fit(X_train_vec, y_train_vec)
y_pred_rf3 = rf3.predict(X_test_vec)
ac_rf = accuracy_score(y_test_vec, y_pred_rf3)
cm_rf = confusion_matrix(y_test_vec, y_pred_rf3)
print(ac_rf)
print(cm_rf)

  return fit_method(estimator, *args, **kwargs)


0.7920841683366734
[[784 210]
 [205 797]]
