In [94]:
from nb import *
from munching import *

In [95]:
with open('dataset.json') as df:
    masterDset = json.load(df)
ucutDset = filterMinSize(masterDset, 20)
lcutDset = filterMaxSize(masterDset, 19)
N = len(ucutDset)
LN = len(lcutDset)

In [96]:
# sentiment analyser for comments
def trainedNB(lcutDset,ratingThresh):
    LN = len(lcutDset)
    X,y = prepNBDset(lcutDset,ratingThresh,1)
    train_x, test_x, train_y, test_y = train_test_split(X,y,test_size=0.33,random_state=42)
    nbModel = NBSentimentClassifier()
    nbModel.fit(train_x,train_y)
    print('Thresh : {}'.format(ratingThresh))
    print('Training accuracy : {}'.format(nbModel.score(train_x,train_y)))
    print('Test accuracy : {}'.format(nbModel.score(test_x,test_y)))
    return nbModel

In [97]:
# prediction of average rating for movie, corresponding to given sentiment analyser model
def predictMovieRating(comments, model):
    return np.mean(model.predict(comments))

### Train different sentiment analysers by differing the threshold for good and bad comments

In [98]:
nb3 = trainedNB(lcutDset,3)

Thresh : 3
Training accuracy : 0.9402180424104468
Test accuracy : 0.921692607003891


In [99]:
nb5 = trainedNB(lcutDset,5)

Thresh : 5
Training accuracy : 0.8646220198873847
Test accuracy : 0.748784046692607


In [100]:
nb6 = trainedNB(lcutDset,5)

Thresh : 5
Training accuracy : 0.8646220198873847
Test accuracy : 0.748784046692607


In [101]:
nb7 = trainedNB(lcutDset,7)

Thresh : 7
Training accuracy : 0.8480891338205343
Test accuracy : 0.6777723735408561


In [102]:
nb8 = trainedNB(lcutDset,8)

Thresh : 8
Training accuracy : 0.8718102312207979
Test accuracy : 0.7838035019455253


In [103]:
nb9 = trainedNB(lcutDset,9)

Thresh : 9
Training accuracy : 0.9138612675212652
Test accuracy : 0.8779182879377432


In [104]:
sampleMovie = list(ucutDset.keys())[0]
print(sampleMovie)

comments = [x['Title'] for x in ucutDset[sampleMovie]['Comments']]
print(comments[:2])

print(predictMovieRating(comments,nb5))

avg_rating = ucutDset[sampleMovie]['Rating']
print(avg_rating)

Titanic


In [108]:
def prepDset(dset,sentimentModels):
#     dset = filterMinSize(fname,minSize)
    X = []
    y = []
    for movie in dset:
        comments = [x['Title'] for x in dset[movie]['Comments']]
        sentimentModelPreds = [predictMovieRating(comments,sentimentModel) for sentimentModel in sentimentModels]
        X.append([*sentimentModelPreds,dset[movie]['Size']])
        y.append(dset[movie]['Rating'])
    return np.array(X),y

In [109]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

def ssDset(dset,sentimentModels,test_size=0.33):
    X,y = prepDset(dset,sentimentModels)
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=test_size,random_state=42)
    ss = StandardScaler()
    ss.fit(X_train)
    X_train = ss.transform(X_train)
    X_test = ss.transform(X_test)
    return X_train, X_test, np.array(y_train), np.array(y_test)

In [110]:
train_x,test_x,train_y,test_y = ssDset(ucutDset,[nb5,nb6,nb7,nb8])

In [111]:
print(train_x.shape)
print(test_x.shape)
print(train_y.shape)
print(test_y.shape)

(252, 5)
(125, 5)
(252,)
(125,)


In [112]:
from sklearn.preprocessing import PolynomialFeatures
pf = PolynomialFeatures(2)
XP_train = pf.fit_transform(train_x)
XP_test = pf.fit_transform(test_x)

In [113]:
from sklearn.linear_model import LinearRegression
linModel = LinearRegression()
linModel.fit(XP_train,train_y)
print(linModel.coef_)
print(linModel.intercept_)

[ 0.          0.27147674  0.27147674  0.50860422  0.0812662   0.14178564
  0.00225681  0.00225681 -0.06912912 -0.00695692  0.02613207  0.00225681
 -0.06912912 -0.00695692  0.02613207 -0.02456351  0.02139656  0.03339996
 -0.01770396 -0.07629103 -0.03150295]
7.22668135522254


In [114]:
print(linModel.score(XP_train,train_y))
print(linModel.score(XP_test,test_y))

0.7057250896052782
0.6983183202331811


In [115]:
from sklearn.tree import DecisionTreeRegressor
decRegModel = DecisionTreeRegressor(max_depth=2)
decRegModel.fit(train_x,train_y)
print(decRegModel.score(train_x,train_y))
print(decRegModel.score(test_x,test_y))

0.6311355777150343
0.45760800137823077


In [116]:
from sklearn.svm import SVR
svrModel = SVR()
svrModel.fit(train_x,train_y)
print(svrModel.score(train_x,train_y))
print(svrModel.score(test_x,test_y))

0.7353500707735305
0.6356149268763553


In [117]:
from sklearn.ensemble import RandomForestRegressor
rfModel = RandomForestRegressor(max_depth=4)
rfModel.fit(train_x,train_y)
print(rfModel.score(train_x,train_y))
print(rfModel.score(test_x,test_y))

0.798841076739281
0.6016305033574227
