In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn import metrics

In [2]:
# Read the five-class sentiment phrases from the CSV stored alongside this
# notebook, then preview the first five rows to check the columns.
# NOTE(review): the preview shows an 'Unnamed: 0' column, which suggests the CSV
# was written with its index; index_col=0 would drop it — confirm nothing
# downstream relies on that column before changing it.
df = pd.read_csv(filepath_or_buffer='sentiment_5_class.csv')
df.head(n=5)

Unnamed: 0,Phrase,Sentiment
0,injects just enough freshness into the proceed...,3
1,that,2
2,never plays as dramatic even when dramatic thi...,0
3,"None of this is very original , and it is n't ...",0
4,", Madonna gives her best performance since Abe...",3


In [3]:
# Pull the inputs (phrase text) and the targets (sentiment labels) out of the
# frame as plain Python lists.
X = df['Phrase'].tolist()
y = df['Sentiment'].tolist()

In [4]:
# Hold out 20% of the phrases for testing and keep 80% for training. The split
# is stratified on the label and seeded so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=1,
)

# Fit the TF-IDF vectorizer on the training phrases only, so the test split
# plays no part in building the features. fit() returns the vectorizer itself.
vectorizer = TfidfVectorizer().fit(X_train)

# Encode both partitions with the vectorizer fitted above.
X_train_v, X_test_v = (vectorizer.transform(texts) for texts in (X_train, X_test))

In [5]:
# Train an RBF-kernel support vector classifier on the TF-IDF training matrix.
# fit() hands back the estimator itself, so the chained call binds the trained
# model; the bare name on the last line echoes its configuration.
model = SVC(kernel='rbf', C=10.0, random_state=1).fit(X_train_v, y_train)
model


SVC(C=10.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=1, shrinking=True, tol=0.001,
    verbose=False)

# Save Model

In [6]:
import pickle

file_name = 'model.pkl'

# Serialise the trained classifier to disk. The context manager guarantees the
# file handle is flushed and closed, even if pickling raises.
# NOTE(review): only the classifier is stored; predicting on raw text in a
# fresh session would also need the fitted `vectorizer` — consider saving it too.
with open(file_name, 'wb') as model_file:
    pickle.dump(model, model_file)

In [7]:
# List the working directory; model.pkl should appear among the files.
!ls

'Class Remaining Topic Supervised Learning.ipynb'  'Lecture slides'	    SVM
'Decision Tree'					    model.pkl
'Ensemble Methods'				    sentiment_5_class.csv


# Load Model

In [8]:
file_name = 'model.pkl'

# Restore the pickled classifier. The context manager closes the file handle
# as soon as loading finishes, even if unpickling raises.
# NOTE: pickle.load can execute arbitrary code while unpickling — only load
# files that come from a trusted source.
with open(file_name, 'rb') as model_file:
    load_model = pickle.load(model_file)

In [9]:
# Echo the restored estimator so its hyper-parameters can be compared with the
# model that was saved.
load_model

SVC(C=10.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=1, shrinking=True, tol=0.001,
    verbose=False)

In [10]:
# Predict labels for the held-out TF-IDF matrix using the model restored from disk.
y_pred = load_model.predict(X_test_v)

In [11]:
# Per-class precision, recall and F1 on the test split. print() is used because
# the report comes back as one multi-line string.
print(metrics.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.76      0.67      0.71       247
           1       0.68      0.58      0.62       291
           2       0.65      0.63      0.64       469
           3       0.72      0.81      0.76      1759
           4       0.71      0.61      0.65       912

    accuracy                           0.71      3678
   macro avg       0.70      0.66      0.68      3678
weighted avg       0.71      0.71      0.70      3678



# Imbalanced Dataset

**Undersampling, Oversampling, SMOTE**

In [13]:
from collections import Counter

In [14]:
# Count training examples per sentiment label to see how uneven the classes are.
Counter(y_train)

Counter({3: 7033, 2: 1876, 1: 1165, 4: 3649, 0: 988})

In [15]:
from imblearn.under_sampling import RandomUnderSampler

Using TensorFlow backend.


In [16]:
# Random under-sampling of the TF-IDF training matrix, seeded for repeatability.
# The counts below show every class cut down to the size of the rarest one.
under_sampler = RandomUnderSampler(random_state=42)
(x_under, y_under) = under_sampler.fit_resample(
    X_train_v,
    y_train,
)

# Per-class counts after resampling.
Counter(y_under)

Counter({0: 988, 1: 988, 2: 988, 3: 988, 4: 988})

In [17]:
# Same SVC configuration as the first model, now trained on the under-sampled
# data. fit() returns the estimator, and the bare name echoes its configuration.
# NOTE(review): this rebinds `model`, replacing the classifier that was fitted
# on the full training set — re-running the save cell afterwards would pickle
# this one instead.
model = SVC(kernel='rbf', C=10.0, random_state=1).fit(x_under, y_under)
model


SVC(C=10.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=1, shrinking=True, tol=0.001,
    verbose=False)

In [19]:
from imblearn.over_sampling import RandomOverSampler

In [21]:
# Random over-sampling of the TF-IDF training matrix, seeded for repeatability.
# The counts below show every class raised to the size of the largest one.
over_sampler = RandomOverSampler(random_state=42)
(x_over, y_over) = over_sampler.fit_resample(
    X_train_v,
    y_train,
)

# Per-class counts after resampling.
Counter(y_over)

Counter({3: 7033, 2: 7033, 1: 7033, 4: 7033, 0: 7033})

In [22]:
from imblearn.over_sampling import SMOTE #Data Interpolation

In [23]:
# SMOTE over-sampling (data interpolation) of the TF-IDF training matrix,
# seeded for repeatability. The counts below show every class brought up to
# the size of the largest one.
smote = SMOTE(random_state=42)
(x_smote, y_smote) = smote.fit_resample(
    X_train_v,
    y_train,
)

# Per-class counts after resampling.
Counter(y_smote)

Counter({3: 7033, 2: 7033, 1: 7033, 4: 7033, 0: 7033})

# Genetic Algorithm