In [1]:
import os
import random
import pandas as pd


In [2]:
# !pip install gensim

In [3]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
# from gensim.models import word2vec
from gensim.models import Word2Vec
from sklearn.ensemble import RandomForestClassifier
import multiprocessing as mp
# from cuml.dask.ensemble import RandomForestClassifier as cuRF_mg
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn import metrics # accuracy_score

In [4]:
FILENAME = 'yelp_new.csv'
COLUMN = 'sentence1'
df = pd.read_csv(FILENAME)
df.head(3)

Unnamed: 0,text,sentence1,sentence2,sentence3,stars,label
0,My wife took me here on my birthday for breakf...,my wife took me here on my birthday for breakf...,My wife took birthday breakfast excellent. The...,My wife took me here on my birth...,5,1
1,I have no idea why some people give bad review...,i have no idea why some people give bad review...,I idea people give bad reviews place. It goes ...,I have no idea why some people g...,5,1
2,love the gyro plate. Rice is so good and I als...,love the gyro plate rice is so good and i also...,love gyro plate. Rice good I also dig candy se...,love the gyro plate . Rice is so ...,4,1


In [10]:
class Kfold_CV():
    def __init__(self, X, y, n_splits):
        self.X, self.y = X.copy(), y.copy()
        self.n_splits = n_splits
        self.data_list = []

    def split_train_test(self, random_seed=42):
        sss = StratifiedShuffleSplit(n_splits=self.n_splits,
                                    test_size=1/self.n_splits,
                                    random_state=random_seed)
        for train_indx, test_index in sss.split(self.X, self.y):
            self.X_train, self.y_train = self.X[train_indx], self.y[train_indx]
            self.X_test, self.y_test = self.X[test_index], self.y[test_index]
            self.data_list.append((self.X_train, self.y_train, self.X_test, self.y_test))
        
        return None
    
    def train_test_step(self, X, y, testX, testy):
        self.model = RandomForestClassifier(
                                            n_estimators=100,
                                            # max_depth=13,
                                            n_jobs=mp.cpu_count()) # default
        print(self.model)
        self.model.fit(X, y)
        train_score = self.model.score(X, y)
        test_score = self.model.score(testX, testy)
        predy = self.model.predict(testX)

        return predy, train_score, test_score 

    def fit(self):
        train_acc, test_acc = 0, 0
        self.split_train_test()
        for i, data in enumerate(self.data_list, 1):
            # X_train, y_train, X_test, y_test = data
            predy, train_score, test_score = self.train_test_step(*data)
            print(f"{i}-fold".center(25, '='))
            print(f"Training score: {train_score:.4f}")
            print(f"Training score: {test_score:.4f}")

            train_acc += train_score
            test_acc += test_score
        train_acc = train_acc/self.n_splits
        test_acc = test_acc/self.n_splits
        print("=".center(25, '*'))
        print(f"{self.n_splits}-Fold Cross validation ")
        print(f"Training score mean: {train_acc:.4f}")
        print(f"Testing score mean : {test_acc:.4f}")
        return round(train_acc,4), round(test_acc,4)


## TF-IDF

In [12]:
columns = ['sentence1', 'sentence2', 'sentence3']

for column in columns:
    print(f"{column}".center(50, '+'))
    model_name = f'tfidf_{column}'
    tfidf = TfidfVectorizer(stop_words='english')
    #Replace NaN with an empty string
    df[column] = df[column].fillna('')
    tfidf_dataX = tfidf.fit_transform(df[column]).toarray()
    tfidf_datay = df['label'].apply(float).array
    # print(tfidf_dataX.shape, tfidf_datay.shape)
    Kfold_CV(tfidf_dataX, tfidf_datay, n_splits=5).fit()
    # break

++++++++++++++++++++++column++++++++++++++++++++++
RandomForestClassifier(n_jobs=8)
Training score: 0.9999
Training score: 0.7890
RandomForestClassifier(n_jobs=8)
Training score: 1.0000
Training score: 0.7925
RandomForestClassifier(n_jobs=8)
Training score: 0.9999
Training score: 0.7855
RandomForestClassifier(n_jobs=8)
Training score: 0.9999
Training score: 0.7835
RandomForestClassifier(n_jobs=8)
Training score: 0.9999
Training score: 0.7880
-------------------------
5-Fold Cross validation 
Training score mean: 0.9999
Testing score mean : 0.7877
++++++++++++++++++++++column++++++++++++++++++++++
RandomForestClassifier(n_jobs=8)
Training score: 0.9999
Training score: 0.7995
RandomForestClassifier(n_jobs=8)
Training score: 1.0000
Training score: 0.7910
RandomForestClassifier(n_jobs=8)
Training score: 0.9999
Training score: 0.7905
RandomForestClassifier(n_jobs=8)
Training score: 0.9999
Training score: 0.7860
RandomForestClassifier(n_jobs=8)
Training score: 0.9999
Training score: 0.7940
-

## Word2Vec
- everyword will transform to a vector
- every review will extract some stop words and reconsist to tokens
- get words from tokens and transform it to vector and add them together

In [19]:
def word_to_vec(tokens, size, w2v):
    count = 0
    vec = np.zeros(size).reshape((1,size))
    for word in tokens:
        try:
            vec += w2v[word].reshape((1,size))
            count+=1
        except:
            continue
    
    # Normalize
    if count: 
        vec/=count

    return vec

[word2vec](https://stackoverflow.com/questions/53195906/getting-init-got-an-unexpected-keyword-argument-document-this-error-in) : something version problem about the input parameter

In [20]:
for column in columns:
    print(f"{column}".center(50, '+'))
    # w2v = Word2Vec(df[column], min_count=1, size=250, iter=30, sg=1) # the old version is depreciated
    print("Start to fit the word2vec process, it takes time ...")
    w2v = Word2Vec(df[column], min_count=1, vector_size=250, epochs=30, sg=1)
    w2v_X = np.zeros((len(df[column]), 250))
    for i in range(len(df[column])):
        w2v_X[i,:] = word_to_vec(df[column].iloc[i], 250, w2v)
    w2v_y = df['label'].apply(float).array
    Kfold_CV(w2v_X, w2v_y, n_splits=5).fit()
    


++++++++++++++++++++sentence1+++++++++++++++++++++
Start to fit the word2vec process, it takes time ...
RandomForestClassifier(n_jobs=8)
Training score: 0.6863
Training score: 0.6865
RandomForestClassifier(n_jobs=8)
Training score: 0.6863
Training score: 0.6865
RandomForestClassifier(n_jobs=8)
Training score: 0.6863
Training score: 0.6865
RandomForestClassifier(n_jobs=8)
Training score: 0.6863
Training score: 0.6865
RandomForestClassifier(n_jobs=8)
Training score: 0.6863
Training score: 0.6865
-------------------------
5-Fold Cross validation 
Training score mean: 0.6863
Testing score mean : 0.6865
++++++++++++++++++++sentence2+++++++++++++++++++++
Start to fit the word2vec process, it takes time ...
RandomForestClassifier(n_jobs=8)
Training score: 0.6863
Training score: 0.6865
RandomForestClassifier(n_jobs=8)
Training score: 0.6863
Training score: 0.6865
RandomForestClassifier(n_jobs=8)
Training score: 0.6863
Training score: 0.6865
RandomForestClassifier(n_jobs=8)
Training score: 0.68