## MODEL APPLICATION on NEW DATASET

## Import Dependencies

In [1]:
import os
import time
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

In [2]:
from sklearn.metrics import accuracy_score
from sklearn import model_selection
from sklearn.model_selection import train_test_split

In [3]:
import joblib
from scipy import io

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

## Import Raw Reviews Data

- First Read Raw Reviews

In [74]:
#raw = pd.read_csv('5reviews.csv')

In [75]:
#raw = raw[:4720]

- For Second Runs

In [5]:
raw = pd.read_csv("8Scored.csv")

In [6]:
raw.head()

Unnamed: 0,review,score,rf_count,rf_ngram,rf_word,rf_char,xgb_count,xgb_ngram,xgb_word,xgb_char,cb_count,cb_ngram
0,"The only problem, I actually counted the amoun...",4.0,[0],[0],[0],[0],[0],[0],[0],[0],[[0]],[[0]]
1,Me and my wife and my kids love them,5.0,[2],[2],[2],[2],[2],[2],[2],[2],[[2]],[[2]]
2,Definitely one of the best candies you can buy...,5.0,[2],[2],[2],[2],[2],[2],[2],[2],[[2]],[[2]]
3,"The price is great, but there are mostly grape...",3.0,[2],[2],[2],[2],[2],[2],[2],[2],[[2]],[[2]]
4,"Box and packaging in great shape, kids love them.",5.0,[2],[2],[2],[2],[2],[2],[2],[2],[[2]],[[2]]


## Import Prepared Reviews Data

In [7]:
reviews = pd.read_csv('7reviewsPrepared.csv')

In [8]:
reviews.head()

Unnamed: 0,review
0,problem actually counted amount box box
1,wife kids love
2,definitely one best candies buy selling school...
3,price great mostly grape orange flavors box
4,box packaging great shape kids love them


In [9]:
reviews.describe().T

Unnamed: 0,count,unique,top,freq
review,4707,4592,good,17


In [10]:
reviews.isnull().sum()

review    13
dtype: int64

In [11]:
reviews = reviews.dropna()

In [12]:
reviews.isnull().sum()

review    0
dtype: int64

## Import Past VADER Scores for Fitting

In [13]:
vaders = pd.read_csv("D:\\Study\\4.1\\DesignProject\\Code\\source\\vaders.csv")

## VADER Rescoring

In [14]:
vaders["Neg-Pos"] = vaders["compound"].apply(lambda x: 0 if x< 0.5 else 1)

In [15]:
vaders.head()

Unnamed: 0,index,Text,neg,neu,pos,compound,Score,SScoring,Sentiment,Neg-Pos
0,0,bought several vitality canned dog food produc...,0.0,0.517,0.483,0.9413,5,2,pos,1
1,1,product arrived labeled jumbo salted peanutsth...,0.129,0.762,0.11,-0.1027,1,0,neg,0
2,2,confection around century light pillowy citrus...,0.13,0.584,0.286,0.8624,4,2,pos,1
3,3,looking secret ingredient robitussin believe f...,0.0,0.868,0.132,0.4404,2,1,neu,0
4,4,great taffy great price wide assortment yummy ...,0.0,0.369,0.631,0.9468,5,2,pos,1


## Train - Test Split

In [16]:
x_train, x_test, y_train, y_test = train_test_split(vaders["Text"], vaders["Neg-Pos"], test_size=0.25, random_state=1)

## Model Predictions

### Random Forest

- Import Models

In [15]:
rf_count = joblib.load("D:\\Study\\4.1\\DesignProject\\models\\rf_model_count.joblib")

In [18]:
rf_ngram = joblib.load("D:\\Study\\4.1\\DesignProject\\models\\rf_model_ngram.joblib")

In [17]:
rf_word= joblib.load("D:\\Study\\4.1\\DesignProject\\models\\rf_model_word.joblib")

In [17]:
rf_chars = joblib.load("D:\\Study\\4.1\\DesignProject\\models\\rf_model_chars.joblib")

#### CountVectorizer

In [16]:
countV = CountVectorizer()

In [17]:
countV.fit(x_train.apply(lambda x: np.str_(x)))

In [77]:
raw["rf_count"] = reviews["review"].apply(lambda x: rf_count.predict(countV.transform(pd.Series(x))))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent work

In [None]:
raw.head()

- Export Scores

In [78]:
df = pd.DataFrame(raw)
df.to_csv(r'9Scored.csv', index=False, header=True)

#### TF-IDF Ngram Level

In [19]:
ngramV = TfidfVectorizer(ngram_range = (1,2))

In [20]:
ngramV.fit(x_train.apply(lambda x: np.str_(x)))

In [21]:
raw["rf_ngram"] = reviews["review"].apply(lambda x: rf_ngram.predict(ngramV.transform(pd.Series(x))))

In [22]:
raw.head()

Unnamed: 0,review,score,rf_count,rf_ngram
0,"The only problem, I actually counted the amoun...",4.0,[0],[0]
1,Me and my wife and my kids love them,5.0,[2],[2]
2,Definitely one of the best candies you can buy...,5.0,[2],[2]
3,"The price is great, but there are mostly grape...",3.0,[2],[2]
4,"Box and packaging in great shape, kids love them.",5.0,[2],[2]


- Write Above CSV

In [24]:
df = pd.DataFrame(raw)
df.to_csv('8Scored.csv', index=False, header=True)

#### TF-IDF Word Level

In [18]:
wordV = TfidfVectorizer()

In [19]:
wordV.fit(x_train.apply(lambda x: np.str_(x)))

In [20]:
raw["rf_word"] = reviews["review"].apply(lambda x: rf_word.predict(wordV.transform(pd.Series(x))))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent work

In [21]:
raw.head()

Unnamed: 0,review,score,rf_count,rf_ngram,rf_word
0,"The only problem, I actually counted the amoun...",4.0,[0],[0],[0]
1,Me and my wife and my kids love them,5.0,[2],[2],[2]
2,Definitely one of the best candies you can buy...,5.0,[2],[2],[2]
3,"The price is great, but there are mostly grape...",3.0,[2],[2],[2]
4,"Box and packaging in great shape, kids love them.",5.0,[2],[2],[2]


- Write Above CSV

In [22]:
df = pd.DataFrame(raw)
df.to_csv('8Scored.csv', index=False, header=True)

#### TF-IDF Character Level

In [18]:
charV = TfidfVectorizer(analyzer="char", ngram_range=(1,2))

In [19]:
charV.fit(x_train.apply(lambda x: np.str_(x)))

In [20]:
raw["rf_char"] = reviews["review"].apply(lambda x: rf_chars.predict(charV.transform(pd.Series(x))))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent work

In [21]:
raw.head()

Unnamed: 0,review,score,rf_count,rf_ngram,rf_word,rf_char
0,"The only problem, I actually counted the amoun...",4.0,[0],[0],[0],[0]
1,Me and my wife and my kids love them,5.0,[2],[2],[2],[2]
2,Definitely one of the best candies you can buy...,5.0,[2],[2],[2],[2]
3,"The price is great, but there are mostly grape...",3.0,[2],[2],[2],[2]
4,"Box and packaging in great shape, kids love them.",5.0,[2],[2],[2],[2]


- Write Above CSV

In [22]:
df = pd.DataFrame(raw)
df.to_csv('8Scored.csv', index=False, header=True)

### XGBoost

- Import Models

In [17]:
xgb_count = joblib.load("D:\\Study\\4.1\\DesignProject\\models\\xgb_model_count.joblib")

In [17]:
xgb_ngram = joblib.load("D:\\Study\\4.1\\DesignProject\\models\\xgb_model_ngram.joblib")

In [17]:
xgb_word = joblib.load("D:\\Study\\4.1\\DesignProject\\models\\xgb_model_word.joblib")

In [26]:
xgb_chars = joblib.load("D:\\Study\\4.1\\DesignProject\\models\\xgb_model_chars.joblib")

#### CountVectorizer

In [18]:
countV = CountVectorizer()

In [19]:
countV.fit(x_train.apply(lambda x: np.str_(x)))

In [20]:
raw["xgb_count"] = reviews["review"].apply(lambda x: xgb_count.predict(countV.transform(pd.Series(x))))

In [21]:
raw.head()

Unnamed: 0,review,score,rf_count,rf_ngram,rf_word,rf_char,xgb_count
0,"The only problem, I actually counted the amoun...",4.0,[0],[0],[0],[0],[0]
1,Me and my wife and my kids love them,5.0,[2],[2],[2],[2],[2]
2,Definitely one of the best candies you can buy...,5.0,[2],[2],[2],[2],[2]
3,"The price is great, but there are mostly grape...",3.0,[2],[2],[2],[2],[2]
4,"Box and packaging in great shape, kids love them.",5.0,[2],[2],[2],[2],[2]


- Write Above CSV

In [22]:
df = pd.DataFrame(raw)
df.to_csv('8Scored.csv', index=False, header=True)

#### TF-IDF Ngram Level

In [18]:
ngramV = TfidfVectorizer(ngram_range = (1,2))

In [19]:
ngramV.fit(x_train.apply(lambda x: np.str_(x)))

In [20]:
raw["xgb_ngram"] = reviews["review"].apply(lambda x: xgb_ngram.predict(ngramV.transform(pd.Series(x))))

In [21]:
raw.head()

Unnamed: 0,review,score,rf_count,rf_ngram,rf_word,rf_char,xgb_count,xgb_ngram
0,"The only problem, I actually counted the amoun...",4.0,[0],[0],[0],[0],[0],[0]
1,Me and my wife and my kids love them,5.0,[2],[2],[2],[2],[2],[2]
2,Definitely one of the best candies you can buy...,5.0,[2],[2],[2],[2],[2],[2]
3,"The price is great, but there are mostly grape...",3.0,[2],[2],[2],[2],[2],[2]
4,"Box and packaging in great shape, kids love them.",5.0,[2],[2],[2],[2],[2],[2]


- Write Above CSV

In [22]:
df = pd.DataFrame(raw)
df.to_csv('8Scored.csv', index=False, header=True)

#### TF-IDF Word Level

In [18]:
wordV = TfidfVectorizer()

In [19]:
wordV.fit(x_train.apply(lambda x: np.str_(x)))

In [20]:
raw["xgb_word"] = reviews["review"].apply(lambda x: xgb_word.predict(wordV.transform(pd.Series(x))))

In [21]:
raw.head()

Unnamed: 0,review,score,rf_count,rf_ngram,rf_word,rf_char,xgb_count,xgb_ngram,xgb_word
0,"The only problem, I actually counted the amoun...",4.0,[0],[0],[0],[0],[0],[0],[0]
1,Me and my wife and my kids love them,5.0,[2],[2],[2],[2],[2],[2],[2]
2,Definitely one of the best candies you can buy...,5.0,[2],[2],[2],[2],[2],[2],[2]
3,"The price is great, but there are mostly grape...",3.0,[2],[2],[2],[2],[2],[2],[2]
4,"Box and packaging in great shape, kids love them.",5.0,[2],[2],[2],[2],[2],[2],[2]


- Write Above CSV

In [22]:
df = pd.DataFrame(raw)
df.to_csv('8Scored.csv', index=False, header=True)

#### TF-IDF Character Level

In [23]:
charV = TfidfVectorizer(analyzer="char", ngram_range=(1,2))

In [24]:
charV.fit(x_train.apply(lambda x: np.str_(x)))

In [27]:
raw["xgb_char"] = reviews["review"].apply(lambda x: xgb_chars.predict(charV.transform(pd.Series(x))))

In [28]:
raw.head()

Unnamed: 0,review,score,rf_count,rf_ngram,rf_word,rf_char,xgb_count,xgb_ngram,xgb_word,xgb_char
0,"The only problem, I actually counted the amoun...",4.0,[0],[0],[0],[0],[0],[0],[0],[0]
1,Me and my wife and my kids love them,5.0,[2],[2],[2],[2],[2],[2],[2],[2]
2,Definitely one of the best candies you can buy...,5.0,[2],[2],[2],[2],[2],[2],[2],[2]
3,"The price is great, but there are mostly grape...",3.0,[2],[2],[2],[2],[2],[2],[2],[2]
4,"Box and packaging in great shape, kids love them.",5.0,[2],[2],[2],[2],[2],[2],[2],[2]


- Write Above CSV

In [29]:
df = pd.DataFrame(raw)
df.to_csv('8Scored.csv', index=False, header=True)

### CatBoost

- Import Models

In [16]:
cb_count = joblib.load("D:\\Study\\4.1\\DesignProject\\models\\cb_count.joblib")

In [26]:
cb_ngram = joblib.load("D:\\Study\\4.1\\DesignProject\\models\\cb_ngram.joblib")

In [17]:
cb_word = joblib.load("D:\\Study\\4.1\\DesignProject\\models\\cb_word.joblib")

In [18]:
cb_chars = joblib.load("D:\\Study\\4.1\\DesignProject\\models\\cb_chars.joblib")

#### CountVectorizer

In [18]:
countV = CountVectorizer()

In [19]:
countV.fit(x_train.apply(lambda x: np.str_(x)))

In [20]:
raw["cb_count"] = reviews["review"].apply(lambda x: cb_count.predict(countV.transform(pd.Series(x))))

In [21]:
raw.head()

Unnamed: 0,review,score,rf_count,rf_ngram,rf_word,rf_char,xgb_count,xgb_ngram,xgb_word,xgb_char,cb_count
0,"The only problem, I actually counted the amoun...",4.0,[0],[0],[0],[0],[0],[0],[0],[0],[[0]]
1,Me and my wife and my kids love them,5.0,[2],[2],[2],[2],[2],[2],[2],[2],[[2]]
2,Definitely one of the best candies you can buy...,5.0,[2],[2],[2],[2],[2],[2],[2],[2],[[2]]
3,"The price is great, but there are mostly grape...",3.0,[2],[2],[2],[2],[2],[2],[2],[2],[[2]]
4,"Box and packaging in great shape, kids love them.",5.0,[2],[2],[2],[2],[2],[2],[2],[2],[[2]]


- Write Above CSV

In [22]:
df = pd.DataFrame(raw)
df.to_csv('8Scored.csv', index=False, header=True)

#### TF-IDF Ngram Level

In [29]:
ngramV = TfidfVectorizer(ngram_range = (1,2))

In [30]:
ngramV.fit(x_train.apply(lambda x: np.str_(x)))

In [31]:
raw["cb_ngram"] = reviews["review"].apply(lambda x: cb_ngram.predict(ngramV.transform(pd.Series(x))))

In [32]:
raw.head()

Unnamed: 0,review,score,rf_count,rf_ngram,rf_word,rf_char,xgb_count,xgb_ngram,xgb_word,xgb_char,cb_count,cb_ngram
0,"The only problem, I actually counted the amoun...",4.0,[0],[0],[0],[0],[0],[0],[0],[0],[[0]],[[0]]
1,Me and my wife and my kids love them,5.0,[2],[2],[2],[2],[2],[2],[2],[2],[[2]],[[2]]
2,Definitely one of the best candies you can buy...,5.0,[2],[2],[2],[2],[2],[2],[2],[2],[[2]],[[2]]
3,"The price is great, but there are mostly grape...",3.0,[2],[2],[2],[2],[2],[2],[2],[2],[[2]],[[2]]
4,"Box and packaging in great shape, kids love them.",5.0,[2],[2],[2],[2],[2],[2],[2],[2],[[2]],[[2]]


- Write Above CSV

In [33]:
df = pd.DataFrame(raw)
df.to_csv('8Scored.csv', index=False, header=True)

#### TF-IDF Word Level

In [19]:
wordV = TfidfVectorizer()

In [20]:
wordV.fit(x_train.apply(lambda x: np.str_(x)))

In [21]:
raw["cb_word"] = reviews["review"].apply(lambda x: cb_word.predict(wordV.transform(pd.Series(x))))

In [22]:
raw.head()

Unnamed: 0,review,score,rf_count,rf_ngram,rf_word,rf_char,xgb_count,xgb_ngram,xgb_word,xgb_char,cb_count,cb_ngram,cb_word
0,"The only problem, I actually counted the amoun...",4.0,[0],[0],[0],[0],[0],[0],[0],[0],[[0]],[[0]],[[0]]
1,Me and my wife and my kids love them,5.0,[2],[2],[2],[2],[2],[2],[2],[2],[[2]],[[2]],[[2]]
2,Definitely one of the best candies you can buy...,5.0,[2],[2],[2],[2],[2],[2],[2],[2],[[2]],[[2]],[[2]]
3,"The price is great, but there are mostly grape...",3.0,[2],[2],[2],[2],[2],[2],[2],[2],[[2]],[[2]],[[2]]
4,"Box and packaging in great shape, kids love them.",5.0,[2],[2],[2],[2],[2],[2],[2],[2],[[2]],[[2]],[[2]]


- Write Above CSV

In [23]:
df = pd.DataFrame(raw)
df.to_csv('8Scored.csv', index=False, header=True)

#### TF-IDF Character Level

In [24]:
charV = TfidfVectorizer(analyzer="char", ngram_range=(1,2))

In [25]:
charV.fit(x_train.apply(lambda x: np.str_(x)))

In [27]:
raw["cb_chars"] = reviews["review"].apply(lambda x: cb_chars.predict(charV.transform(pd.Series(x))))

In [28]:
raw.head()

Unnamed: 0,review,score,rf_count,rf_ngram,rf_word,rf_char,xgb_count,xgb_ngram,xgb_word,xgb_char,cb_count,cb_ngram,cb_word,cb_chars
0,"The only problem, I actually counted the amoun...",4.0,[0],[0],[0],[0],[0],[0],[0],[0],[[0]],[[0]],[[0]],[[0]]
1,Me and my wife and my kids love them,5.0,[2],[2],[2],[2],[2],[2],[2],[2],[[2]],[[2]],[[2]],[[2]]
2,Definitely one of the best candies you can buy...,5.0,[2],[2],[2],[2],[2],[2],[2],[2],[[2]],[[2]],[[2]],[[2]]
3,"The price is great, but there are mostly grape...",3.0,[2],[2],[2],[2],[2],[2],[2],[2],[[2]],[[2]],[[2]],[[2]]
4,"Box and packaging in great shape, kids love them.",5.0,[2],[2],[2],[2],[2],[2],[2],[2],[[2]],[[2]],[[2]],[[2]]


- Write Above CSV

In [29]:
df = pd.DataFrame(raw)
df.to_csv('8Scored.csv', index=False, header=True)