## Import Dependencies

In [1]:
import os
import time
import pandas as pd
import numpy as np

from sklearn.metrics import accuracy_score
from sklearn import model_selection

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier

import warnings
warnings.filterwarnings('ignore')

In [2]:
import joblib
from scipy import sparse, io

## List Input Directory

In [3]:
# List the datasets mounted under the Kaggle input directory.
os.listdir('/kaggle/input/')

['vaders', 'newdata']

## Import Datasets

In [4]:
# Load vaders.csv from the 'vaders' input dataset; this frame supplies the training texts and compound scores.
vaders = pd.read_csv('../input/vaders/vaders.csv')

In [5]:
# Load the prepared reviews; the "review" column of this frame is what the models are applied to later.
newData = pd.read_csv('../input/newdata/7reviewsPrepared.csv')

In [6]:
# Load the reviews file that the prediction columns are attached to and that is exported at the end.
raw = pd.read_csv('../input/newdata/5reviews.csv')

In [7]:
# Load the scores file; only its "score" column is used (copied onto raw in the next cell).
scores = pd.read_csv('../input/newdata/7scores.csv')

In [8]:
# Attach the loaded "score" column to the raw reviews under the name "scores".
# NOTE(review): pandas aligns this on the row index — assumes 5reviews.csv and
# 7scores.csv share the same row order; confirm.
raw = raw.assign(scores=scores["score"])

## VADER Neg-Pos Rescoring

In [9]:
# String label per review: "neg" when the compound value is below 0.5, otherwise "pos".
vaders["Neg-Pos"] = [
    "neg" if compound < 0.5 else "pos" for compound in vaders["compound"]
]

In [28]:
# Numeric version of the same split (0 below 0.5, otherwise 1); this is the
# column used as the training target in the train/test split.
vaders["neg-pos"] = [
    0 if compound < 0.5 else 1 for compound in vaders["compound"]
]

In [29]:
# Preview the first rows, including the two label columns added above.
vaders.head()

Unnamed: 0,index,Text,neg,neu,pos,compound,Score,SScoring,Sentiment,Neg-Pos,neg-pos
0,0,bought several vitality canned dog food produc...,0.0,0.517,0.483,0.9413,5,2,pos,pos,1
1,1,product arrived labeled jumbo salted peanutsth...,0.129,0.762,0.11,-0.1027,1,0,neg,neg,0
2,2,confection around century light pillowy citrus...,0.13,0.584,0.286,0.8624,4,2,pos,pos,1
3,3,looking secret ingredient robitussin believe f...,0.0,0.868,0.132,0.4404,2,1,neu,neg,0
4,4,great taffy great price wide assortment yummy ...,0.0,0.369,0.631,0.9468,5,2,pos,pos,1


## Train - Test Split

In [30]:
# Hold out 25% of the rows for evaluation: texts are the features, the numeric
# "neg-pos" column is the target, and random_state pins the shuffle.
x_train, x_test, y_train, y_test = train_test_split(
    vaders["Text"],
    vaders["neg-pos"],
    test_size=0.25,
    random_state=1,
)

## CountVectorizer

In [31]:
# Vectorizer with default settings; fitted on the training texts in the next cell.
count = CountVectorizer()

In [32]:
# Learn the vocabulary from the training texts only. Every value is coerced to
# a unicode string first, the same way the transform cell coerces its input.
count.fit(x_train.values.astype('U'))

In [33]:
# Encode both splits with the vocabulary learned from the training texts
# (train first, then test), coercing every value to a unicode string.
x_train_count, x_test_count = (
    count.transform(split.values.astype('U')) for split in (x_train, x_test)
)

In [34]:
# Wraps the vectorizer object in a NumPy array; as the recorded output shows, the result is an
# object array holding the estimator itself, not its vocabulary or the count matrices.
# NOTE(review): looks like leftover debugging — confirm whether count.vocabulary_ was intended.
np.asarray(count)

array(CountVectorizer(), dtype=object)

## Random Forest Classifier

In [16]:
# Seed the forest so the accuracy reported below and the exported model can be
# reproduced on a re-run; the seed matches the one used for the train/test split.
rf = RandomForestClassifier(random_state=1)

In [17]:
# Train the forest on the count-encoded training texts and keep the result under rf_model_count.
rf_model_count = rf.fit(x_train_count, y_train)

In [18]:
# Hold-out accuracy of the random forest on the count-encoded test texts.
y_pred = rf_model_count.predict(x_test_count)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.9161447851724672

### Test on New Data

In [19]:
# Count missing values per column of the prepared reviews before scoring them.
newData.isnull().sum()

review    13
dtype: int64

In [20]:
# Discard every row of the prepared reviews that has a missing value in any
# column (the defaults of dropna, spelled out).
newData = newData.dropna(axis=0, how='any')

In [21]:
# Score all prepared reviews with the random forest in a single batch instead
# of vectorising and predicting one review at a time.
rf_new_preds = rf_model_count.predict(count.transform(newData["review"]))
# Store each prediction as a length-1 array, the same per-row value the
# row-wise version produced, and carry newData's index so the assignment into
# raw aligns on index exactly as before.
raw["rf"] = pd.Series(
    [rf_new_preds[i:i + 1] for i in range(len(rf_new_preds))],
    index=newData.index,
    dtype=object,
)

In [23]:
# Preview raw with the random-forest prediction column attached.
raw.head()

Unnamed: 0,review,score,scores,rf
0,"The only problem, I actually counted the amoun...",4.0,4.0,[neg]
1,Me and my wife and my kids love them,5.0,5.0,[pos]
2,Definitely one of the best candies you can buy...,5.0,5.0,[pos]
3,"The price is great, but there are mostly grape...",3.0,3.0,[pos]
4,"Box and packaging in great shape, kids love them.",5.0,5.0,[pos]


### Export Model

In [24]:
# Show the contents of the Kaggle working directory before the model is written there.
os.listdir('/kaggle/working/')

['__notebook_source__.ipynb', '.virtual_documents']

In [25]:
# Persist the fitted random forest to the working directory.
# NOTE(review): the fitted CountVectorizer (`count`) is not exported anywhere in the visible cells;
# the saved model needs the same vocabulary to score new text — confirm it is saved elsewhere.
joblib.dump(rf_model_count,"../working/rf.joblib")

['../working/rf.joblib']

## XGBoost Classifier

In [35]:
# XGBoost classifier with default hyper-parameters.
xgb = XGBClassifier()

In [36]:
# Train XGBoost on the same count-encoded training texts used for the random forest.
# NOTE(review): verbose=2 presumably only controls evaluation logging when an eval_set is passed,
# and none is passed here — confirm against the xgboost version in use.
xgb_model_count = xgb.fit(x_train_count, y_train, verbose=2)

In [37]:
# Hold-out accuracy of the XGBoost model on the count-encoded test texts
# (y_pred is rebound here and no longer holds the random-forest predictions).
y_pred = xgb_model_count.predict(x_test_count)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.9161096021503863

### Test on New Data

In [38]:
# Re-check missing values in the prepared reviews before scoring them with XGBoost.
newData.isnull().sum()

review    0
dtype: int64

In [39]:
# Drop rows with missing values again; the null check recorded just above reports 0 missing
# reviews, so in that run this leaves the frame unchanged.
newData = newData.dropna()

In [63]:
# Score all prepared reviews with XGBoost in a single batch instead of
# vectorising and predicting one review at a time.
xgb_new_preds = xgb_model_count.predict(count.transform(newData["review"]))
# Store each prediction as a length-1 array, the same per-row value the
# row-wise version produced (the relabelling cell further down compares
# against [0]), and carry newData's index so the assignment into raw aligns
# on index exactly as before.
raw["xgb"] = pd.Series(
    [xgb_new_preds[i:i + 1] for i in range(len(xgb_new_preds))],
    index=newData.index,
    dtype=object,
)

In [64]:
# Preview raw with both prediction columns attached.
raw.head()

Unnamed: 0,review,score,rf,xgb
0,"The only problem, I actually counted the amoun...",4.0,[neg],[0]
1,Me and my wife and my kids love them,5.0,[pos],[1]
2,Definitely one of the best candies you can buy...,5.0,[pos],[1]
3,"The price is great, but there are mostly grape...",3.0,[pos],[1]
4,"Box and packaging in great shape, kids love them.",5.0,[pos],[1]


### Export Model

In [42]:
# Show the contents of the working directory before the XGBoost model is written there.
os.listdir('/kaggle/working/')

['rf.joblib', '__notebook_source__.ipynb', '.virtual_documents']

In [43]:
# Persist the fitted XGBoost model to the working directory.
joblib.dump(xgb_model_count,"../working/xgb.joblib")

['../working/xgb.joblib']

### Export Results

In [56]:
# Look at the prediction columns once more before converting the XGBoost output to labels.
raw.head()

Unnamed: 0,review,score,rf,xgb
0,"The only problem, I actually counted the amoun...",4.0,[neg],[0]
1,Me and my wife and my kids love them,5.0,[pos],[1]
2,Definitely one of the best candies you can buy...,5.0,[pos],[1]
3,"The price is great, but there are mostly grape...",3.0,[pos],[1]
4,"Box and packaging in great shape, kids love them.",5.0,[pos],[1]


In [78]:
def _xgb_label(pred):
    """Map one stored XGBoost prediction to its "[neg]" / "[pos]" label.

    Parameters
    ----------
    pred : array-like, scalar, str or missing
        The per-row value of raw["xgb"]: normally a length-1 array holding
        0 or 1, but a bare scalar is accepted as well.

    Returns
    -------
    str or float
        "[neg]" when the prediction equals 0, "[pos]" for any other
        prediction. A value that is already a string is returned unchanged,
        so running the cell a second time does not rewrite every label to
        "[pos]". A missing prediction stays NaN instead of being labelled
        "[pos]".
    """
    if isinstance(pred, str):
        return pred
    flat = np.ravel(pred)  # handles both a length-1 array and a bare scalar
    if flat.size == 0 or pd.isna(flat[0]):
        return np.nan
    return "[neg]" if flat[0] == 0 else "[pos]"


raw["xgb"] = raw["xgb"].apply(_xgb_label)

In [79]:
# Drop every row of raw that has a missing value in any column before exporting.
# NOTE(review): this presumably also removes reviews that received no prediction because they
# were dropped from newData — confirm that is the intent.
raw = raw.dropna()

In [82]:
# Show the working directory contents; both exported model files are listed in the recorded output.
os.listdir('/kaggle/working/')

['rf.joblib', '__notebook_source__.ipynb', 'xgb.joblib', '.virtual_documents']

In [83]:
# Write the reviews with both prediction columns to CSV in the current
# directory, without the row index (the header row is written by default).
raw.to_csv('9RFandXGB.csv', index=False)