# Stanford Sentiment Treebank


In [1]:
import numpy as np 
import pandas as pd 

import gc, re, optuna
from scipy import stats

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

from sklearn.metrics import roc_auc_score, f1_score
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer

# Loading Data

In [2]:
train = pd.read_csv('../input/stanford-sentiment-treebank/train.csv')
test = pd.read_csv('../input/stanford-sentiment-treebank/test.csv')

train.shape, test.shape

((67349, 3), (872, 2))

# Understanding Data

In [3]:
train.head(10)

Unnamed: 0,id,target,sentence
0,16399,0,b'for the uninitiated plays better on video wi...
1,1680,0,b'like a giant commercial for universal studio...
2,47917,1,b'company once again dazzle and delight us '
3,17307,1,"b""'s no surprise that as a director washington..."
4,27051,0,"b', this cross-cultural soap opera is painfull..."
5,67082,1,"b"", the film is n't nearly as downbeat as it s..."
6,54784,0,b'only masochistic moviegoers need apply . '
7,50543,1,b'convince almost everyone that it was put on ...
8,48736,1,"b""like the english patient and the unbearable ..."
9,26263,1,b'his supple understanding of the role '


In [4]:
test.head(10)

Unnamed: 0,id,sentence
0,787,b'a valueless kiddie paean to pro basketball u...
1,458,"b""featuring a dangerously seductive performanc..."
2,423,b'i am sorry that i was unable to get the full...
3,466,b'the inspirational screenplay by mike rich co...
4,545,"b""from the opening scenes , it 's clear that a..."
5,71,b'exquisitely nuanced in mood tics and dialogu...
6,675,b'slick piece of cross-promotion . '
7,250,"b""one of the more intelligent children 's movi..."
8,321,b'but it could have been worse . '
9,106,"b""the movie 's relatively simple plot and unco..."


In [5]:
train.target.value_counts()

1    37569
0    29780
Name: target, dtype: int64

In [6]:
train = train.drop_duplicates()
train.shape

(67349, 3)

# Preprocessing

- Data Clean

In [7]:
def data_clean(document):
    
    document = document.replace("b',", '')
    document = document.replace("b'", '')
    
    # Remove all the special characters
    document = re.sub(r'\W', ' ', str(document))
    
    # remove all single characters
    document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)

    # Remove single characters from the start
    document = re.sub(r'\^[a-zA-Z]\s+', ' ', document)
    
    return document

train.sentence.head().apply(data_clean)

0    for the uninitiated plays better on video with...
1    like giant commercial for universal studios   ...
2           company once again dazzle and delight us  
3    b no surprise that as director washington dema...
4     this cross cultural soap opera is painfully f...
Name: sentence, dtype: object

In [8]:
train['text'] = train.sentence.apply(data_clean)
test['text'] = test.sentence.apply(data_clean)

In [9]:
vectorizer = TfidfVectorizer(
    stop_words = 'english',
    lowercase = True,
    #min_df=0.001357
)

In [10]:
X = vectorizer.fit_transform(train['text'])
x_test = vectorizer.transform(test['text'])

- Get target data

In [11]:
y = train.target.copy()

# Splitting Data

In [12]:
from sklearn.model_selection import train_test_split

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=123, stratify=y, shuffle=True)
X_train.shape, X_valid.shape, y_train.shape, y_valid.shape

((53879, 13507), (13470, 13507), (53879,), (13470,))

# Modeling

### Search best model

In [13]:
models = [XGBClassifier(), LGBMClassifier(), CatBoostClassifier(verbose=False)]

for model in models:
    print(f"Model {model} \n")
    # Fit Model
    model.fit(X_train, y_train)

    # Prediction
    y_pred = model.predict(X_valid)

    # Evaluation
    print("ROC AUC Score :", roc_auc_score(y_valid, y_pred))
    print("F1-score :", f1_score(y_valid, y_pred), "\n")

Model XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, gamma=None,
              gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_to_onehot=None, max_delta_step=None, max_depth=None,
              max_leaves=None, min_child_weight=None, missing=nan,
              monotone_constraints=None, n_estimators=100, n_jobs=None,
              num_parallel_tree=None, predictor=None, random_state=None,
              reg_alpha=None, reg_lambda=None, ...) 

ROC AUC Score : 0.7199862696416433
F1-score : 0.8048280573901162 

Model LGBMClassifier() 

ROC AUC Score : 0.7454576172385088
F1-score : 0.8189140638581446 

Model <catboost.core.CatBoostClassifier object at 0x7ff23798d590> 

ROC AUC Score : 0.790170

# Cross-Validation

In [14]:
skf = StratifiedKFold(n_splits=9, random_state=123, shuffle=True)

y_prediction = []
y_pred_prob  = []
scores = []

for fold, (train_index, val_index) in enumerate(skf.split(train['text'], y)):
    print('*'*14, f" Fold {fold} ", '*'*14, '\n')

    # Splitting Data
    X_train, X_valid = train['text'][train_index], train['text'][val_index]
    y_train, y_valid = y[train_index], y[val_index]
    
    #Vectorize sentences
    X_train = vectorizer.fit_transform(X_train)
    X_valid = vectorizer.transform(X_valid)
    x_test_ = vectorizer.transform(test['text'])
    
    # Define model
    catb_model = CatBoostClassifier(verbose=False)

    # Fit Model
    catb_model.fit(X_train, y_train)

    # Prediction
    y_pred = catb_model.predict(X_valid)

    # Evaluation
    ras = roc_auc_score(y_valid, y_pred)
    scores.append(ras)
    print("ROC AUC Score :", ras)
    print("F1-score :", f1_score(y_valid, y_pred), '\n')
    
    # Make test prediction
    y_prediction.append( catb_model.predict(x_test_) )
    y_pred_prob.append( catb_model.predict_proba(x_test_)[:,1] )
    
    # Free the memory
    del X_train, X_valid, y_train, y_valid, catb_model, y_pred, ras
    gc.collect()

**************  Fold 0  ************** 

ROC AUC Score : 0.7948832706300908
F1-score : 0.8486218776916452 

**************  Fold 1  ************** 

ROC AUC Score : 0.7979680168222033
F1-score : 0.8503886010362695 

**************  Fold 2  ************** 

ROC AUC Score : 0.7901923620773766
F1-score : 0.8457658044434904 

**************  Fold 3  ************** 

ROC AUC Score : 0.7914026707373989
F1-score : 0.8449821756508589 

**************  Fold 4  ************** 

ROC AUC Score : 0.7836854823633704
F1-score : 0.8408749731932232 

**************  Fold 5  ************** 

ROC AUC Score : 0.7883328967490473
F1-score : 0.8439716312056739 

**************  Fold 6  ************** 

ROC AUC Score : 0.7865986869456085
F1-score : 0.8420372563798857 

**************  Fold 7  ************** 

ROC AUC Score : 0.7916422490795167
F1-score : 0.8452316664866616 

**************  Fold 8  ************** 

ROC AUC Score : 0.7919042929859749
F1-score : 0.8451891891891893 



In [15]:
print("Mean ROC AUC Score : ", np.array(scores).mean(axis=0) )

Mean ROC AUC Score :  0.790734436487843


In [16]:
# Define model
model = CatBoostClassifier(verbose=False)

# Fit Model
model.fit(X, y)

# Make test prediction
y_prediction.append( model.predict(x_test) )
y_pred_prob.append( model.predict_proba(x_test)[:,1] )

In [17]:
preds = stats.mode(y_prediction, axis=0)[0][0]
len(preds), test.shape[0]

(872, 872)

# Submission

In [18]:
sub = pd.read_csv('../input/stanford-sentiment-treebank/sample_submission.csv')
sub.target = preds

sub.target.value_counts()

1    527
0    345
Name: target, dtype: int64

In [19]:
sub.to_csv("submission.csv", index=False)

- Save preiction proba data

In [20]:
sub.target = np.array(y_pred_prob).mean(axis=0)
sub.to_csv("submission_proba.csv", index=False)