In [None]:
import pandas as pd
from sklearn.svm import LinearSVC 
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import normalize, RobustScaler, QuantileTransformer
from sklearn import metrics 
from collections import Counter 
from imblearn.ensemble import BalanceCascade 
from sklearn.linear_model import LogisticRegression 
from imblearn.combine import SMOTEENN,SMOTETomek 
import numpy as np
from sklearn.decomposition import PCA

In [None]:
def sampling(X,Y):
    """Under-sample the training data with imbalanced-learn's BalanceCascade.

    Parameters
    ----------
    X : array-like
        Feature matrix.
    Y : array-like, pandas Series or DataFrame
        Class labels; flattened to a 1-D array before resampling.

    Returns
    -------
    tuple
        ``(X_resampled, Y_resampled)`` exactly as returned by the sampler.
        The class counts printed here are taken from ``Y_resampled[0]``,
        i.e. the first generated subset (``n_max_subset=1`` is requested).
    """
    # np.asarray replaces the pandas ``as_matrix()`` call (removed from pandas)
    # and additionally accepts plain lists and ndarrays.
    Y = np.asarray(Y).flatten()
    bc = BalanceCascade(random_state=0,
                     estimator=LogisticRegression(random_state=0),
                     n_max_subset=1)
    X_resampled, Y_resampled = bc.fit_sample(X, Y)
    print('After Sampling: ',sorted(Counter(Y_resampled[0]).items()))
    return (X_resampled, Y_resampled)



In [None]:
def over_sampling_SMOTEENN(X,y):
    """Resample ``(X, y)`` with SMOTEENN and print the resulting class counts.

    Parameters
    ----------
    X : array-like
        Feature matrix.
    y : array-like
        Class labels, one per row of ``X``.

    Returns
    -------
    tuple
        ``(X_resampled, y_resampled)`` as produced by the sampler.
    """
    smote_enn = SMOTEENN(random_state=0)
    # Newer imbalanced-learn releases expose ``fit_resample`` and no longer
    # provide ``fit_sample``; older ones only have ``fit_sample``. Use
    # whichever the installed version offers.
    resample = getattr(smote_enn, 'fit_resample', None) or smote_enn.fit_sample
    X_resampled, y_resampled = resample(X, y)
    print(sorted(Counter(y_resampled).items()))
    return (X_resampled, y_resampled)

In [None]:
def over_sampling_SMOTETomek(X,y):
    """Resample ``(X, y)`` with SMOTETomek and print the resulting class counts.

    Parameters
    ----------
    X : array-like
        Feature matrix.
    y : array-like
        Class labels, one per row of ``X``.

    Returns
    -------
    tuple
        ``(X_resampled, y_resampled)`` as produced by the sampler.
    """
    smote_tomek = SMOTETomek(random_state=0)
    # Newer imbalanced-learn releases expose ``fit_resample`` and no longer
    # provide ``fit_sample``; older ones only have ``fit_sample``. Use
    # whichever the installed version offers.
    resample = getattr(smote_tomek, 'fit_resample', None) or smote_tomek.fit_sample
    X_resampled, y_resampled = resample(X, y)
    print(sorted(Counter(y_resampled).items()))
    return (X_resampled, y_resampled)

In [None]:
def evaluation(Y_test,predictions):
    """Print classification and error metrics for a set of predicted ratings.

    Prints, in order: accuracy, MSE and the sklearn classification report,
    then the number of test values, a histogram of absolute errors, a 5x5
    table of counts (rows = true label, columns = predicted label) and the
    hand-computed MAPE, RMSE and MAE.

    Parameters
    ----------
    Y_test : array-like
        True labels. The count table only has slots for integer labels
        1..5; any other label raises ``KeyError``. MAPE divides by the true
        label, so a label of 0 is not supported.
    predictions : array-like
        Predicted labels, same length and order as ``Y_test``.

    Returns
    -------
    None
        All results are written to stdout.
    """
    # Coerce to arrays so the per-sample loop is positional even when a
    # pandas Series with a shuffled index is passed in.
    Y_test = np.asarray(Y_test)
    predictions = np.asarray(predictions)

    accuracy = metrics.accuracy_score(Y_test, predictions)
    print("Accuracy: %.2f%%" % (accuracy * 100.0))
    # mean_squared_error yields the MSE, not its root; the RMSE is printed at the end.
    print("MSE:{0}".format(metrics.mean_squared_error(Y_test, predictions)))
    print("Classification Report")
    # Report on the ``predictions`` argument rather than a notebook-global ``y_pred``.
    print(metrics.classification_report(Y_test, predictions))

    n_classes = 5
    # samplingError[true - 1][predicted - 1] -> number of samples
    samplingError = {i: {j: 0 for j in range(n_classes)} for i in range(n_classes)}
    errorDict = {}  # absolute error -> number of samples with that error
    sumOfError = 0.0  # sum of relative errors, for MAPE
    sumOfSquareError = 0.0  # sum of squared errors, for RMSE
    totalError = 0.0  # sum of absolute errors, for MAE
    for actual, predicted in zip(Y_test, predictions):
        error = abs(actual - predicted)
        errorDict[error] = errorDict.get(error, 0) + 1
        samplingError[int(actual) - 1][int(predicted) - 1] += 1
        sumOfError += error / actual
        sumOfSquareError += error ** 2
        totalError += error

    print('Total values : '+str(len(Y_test)))
    print(errorDict)
    print(' \t1\t2\t3\t4\t5')
    for row in range(n_classes):
        cells = [str(samplingError[row][col]) for col in range(n_classes)]
        print('\t'.join([str(row + 1)] + cells))
    print(samplingError)
    print("MAPE : "+str((sumOfError/len(Y_test))*100))
    # ``sqrt`` was never imported in this notebook; use numpy's instead.
    print("RMSE :"+str(np.sqrt(sumOfSquareError/len(Y_test))))
    print("MAE:"+str(totalError/len(Y_test)))

In [None]:
def run_QuantileTransformer(X_train,Y_train,X_test):
    """Quantile-transform the features, fitting on the training split only.

    Parameters
    ----------
    X_train : array-like
        Training features; used to fit the transformer.
    Y_train : array-like
        Training labels; passed through unchanged.
    X_test : array-like
        Test features; transformed with the quantiles learned from ``X_train``.

    Returns
    -------
    tuple
        ``(X_train_transformed, Y_train, X_test_transformed)``.
    """
    qt = QuantileTransformer(n_quantiles=10, random_state=0)
    # The transformer returns new arrays instead of modifying its input, so
    # the results must be captured; otherwise the untransformed data is returned.
    X_train = qt.fit_transform(X_train,Y_train)
    X_test = qt.transform(X_test)
    return X_train,Y_train,X_test

In [None]:
def run_RobustScaler(X_train,Y_train,X_test):
    """Scale the features with RobustScaler, fitting on the training split only.

    Parameters
    ----------
    X_train : array-like
        Training features; used to fit the scaler.
    Y_train : array-like
        Training labels; passed through unchanged.
    X_test : array-like
        Test features; scaled with the statistics learned from ``X_train``.

    Returns
    -------
    tuple
        ``(X_train_scaled, Y_train, X_test_scaled)``.
    """
    rs = RobustScaler()
    # The scaler returns new arrays instead of modifying its input, so the
    # results must be captured; otherwise the unscaled data is returned.
    X_train = rs.fit_transform(X_train,Y_train)
    X_test = rs.transform(X_test)
    return X_train,Y_train,X_test

In [None]:
def Normalization(X_train,X_test):
    """Apply sklearn's ``normalize`` (default settings) to each split.

    The two splits are normalised independently of each other.

    Returns
    -------
    tuple
        ``(normalised X_train, normalised X_test)``.
    """
    train_normalised = normalize(X_train)
    test_normalised = normalize(X_test)
    return train_normalised, test_normalised

In [None]:
def run_SVM(X_train, Y_train,X_test):
    """Fit a linear SVM (``C=0.1``) on the training data and predict the test set.

    Parameters
    ----------
    X_train : array-like
        Training features.
    Y_train : array-like
        Training labels.
    X_test : array-like
        Features to predict.

    Returns
    -------
    The predicted labels for ``X_test`` as returned by ``LinearSVC.predict``.
    """
    # Fix the seed so repeated runs give the same predictions, in line with
    # the fixed random_state used by every other estimator in this notebook.
    model = LinearSVC(C=0.1, random_state=0)
    model.fit(X_train, Y_train)
    return model.predict(X_test)

In [None]:
def run_PCA(train, test):
    """Reduce both splits to 30 principal components fitted on ``train``.

    Returns
    -------
    tuple
        ``(train_projected, test_projected)``.
    """
    # fit() returns the estimator itself, so construction and fitting chain.
    reducer = PCA(n_components=30).fit(train)
    train_projected = reducer.transform(train)
    test_projected = reducer.transform(test)
    return train_projected, test_projected

In [None]:
# Load the review data set. The path is relative, so it is resolved against the
# kernel's working directory. Later cells expect a 'Rating' column (used as the
# target) and a 'good' column to be present.
df = pd.read_csv('reviewTable150.csv')

In [None]:
# Hold out 30% of the rows for testing (fixed seed). Every column except
# 'Rating' is a feature; 'Rating' is the target.
X_train, X_test, Y_train, Y_test = train_test_split(
    df.drop('Rating', axis=1),
    df['Rating'],
    test_size=0.3,
    random_state=7,
)

In [None]:
# Print the number of rows per 'Rating' value (np.size of the 'good' column
# within each group) -- presumably a quick look at class balance.
d=df.groupby('Rating')
print(d['good'].agg(np.size))

In [None]:
# Preprocessing option: Normalization() followed by run_PCA(), with the
# training labels used as-is.
# NOTE(review): every preprocessing cell rebuilds x_train/x_test from the raw
# X_train/X_test split, so whichever one is executed last wins -- presumably
# only one of them is meant to be run before training.
x_train, x_test = Normalization(X_train,X_test)
x_train, x_test = run_PCA(x_train, x_test)
y_train=Y_train


In [None]:
# Preprocessing option: run_QuantileTransformer() followed by run_PCA().
# NOTE(review): starts again from the raw X_train/X_test split, so it replaces
# the x_train/x_test/y_train left by any preprocessing cell run before it.
x_train,y_train, x_test = run_QuantileTransformer(X_train,Y_train,X_test)
x_train, x_test = run_PCA(x_train, x_test)


In [None]:
# Preprocessing option: run_RobustScaler() followed by run_PCA().
# NOTE(review): starts again from the raw X_train/X_test split, so it replaces
# the x_train/x_test/y_train left by any preprocessing cell run before it.
x_train,y_train, x_test = run_RobustScaler(X_train,Y_train,X_test)
x_train, x_test = run_PCA(x_train, x_test)

In [None]:
# Optional step: under-sample the training data with sampling().
# sampling() is configured for a single subset (n_max_subset=1); index [0]
# selects that subset from each returned array.
# NOTE(review): x_train/y_train are overwritten, so re-running this cell
# resamples the already-resampled data.
x_train, y_train=sampling(x_train,y_train)
x_train = x_train[0]
y_train = y_train[0]

In [None]:
# Optional step: resample the training data with SMOTEENN.
# NOTE(review): x_train/y_train are overwritten, so running this after another
# resampling cell chains the two -- presumably only one is intended per run.
x_train, y_train = over_sampling_SMOTEENN(x_train,y_train)

In [None]:
# Optional step: resample the training data with SMOTETomek.
# NOTE(review): x_train/y_train are overwritten, so running this after another
# resampling cell chains the two -- presumably only one is intended per run.
x_train, y_train = over_sampling_SMOTETomek(x_train,y_train)

In [None]:
# Train the linear SVM on whatever x_train/y_train the cells above left in the
# kernel, predict x_test and print the evaluation metrics.
# Y_test.values hands the true labels over as a plain array so they can be
# indexed by position.
y_pred=run_SVM(x_train,y_train, x_test)
evaluation(Y_test.values,y_pred)