In [0]:
import matplotlib.pyplot as plt
import pandas
import numpy
import seaborn
import sklearn
from sklearn.datasets import make_blobs,make_moons
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn import linear_model,ensemble
from sklearn import svm
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.cluster import KMeans,MeanShift,AgglomerativeClustering
from sklearn.neighbors import KNeighborsClassifier,KNeighborsRegressor
from sklearn.metrics import silhouette_score,calinski_harabaz_score,r2_score,mean_absolute_error
from sklearn.metrics.pairwise import euclidean_distances,cosine_distances,manhattan_distances

In [0]:
# Load the training CSV and the separate test CSV into DataFrames.
# read_csv already returns a DataFrame, so no extra DataFrame() wrapping is needed.
# NOTE(review): the directory looks like a mounted Google Drive folder in Colab;
# the mount step is not visible in this part of the notebook -- confirm it runs first.
DATA_DIR="/content/drive/My Drive/MontrealLocalTV"
train=pandas.read_csv(DATA_DIR+"/data.csv")
test=pandas.read_csv(DATA_DIR+"/test.csv")

In [0]:
# Remove the columns that are not used as model inputs; the same set is
# removed from the training frame and from the test frame.
dropped_columns=[
  "Unnamed: 0",
  "Episode",
  "Start_time",
  "End_time",
  "Name of episode",
  "Temperature in Montreal during episode",
]
train_new=train.drop(columns=dropped_columns)
test_new=test.drop(columns=dropped_columns)

In [0]:
# Integer-encode every column except the target ("Market Share_total") and the
# two columns kept as raw numbers ("Year", "Length").
#
# The code book of each column is learned from the training frame and then
# reused for the test frame, so a given raw value maps to the same integer in
# both frames. Factorizing the two frames independently numbers values by their
# order of first appearance in each frame, which can give the same raw value
# different codes in train and test.
# Missing values, and test values never seen in the training column, become -1.
passthrough_cols = ("Market Share_total", "Year", "Length")

for col in train_new.columns:
  if col in passthrough_cols:
    continue
  codes, seen_values = pandas.factorize(train_new[col])
  train_new[col] = codes
  if col in test_new.columns:
    # get_indexer returns the position of each test value in the training
    # code book, or -1 when the value is not present there.
    test_new[col] = pandas.Index(seen_values).get_indexer(test_new[col])

# Columns that exist only in the test frame have no training code book, so
# they are encoded on their own exactly as before.
for col in test_new.columns:
  if col in passthrough_cols or col in train_new.columns:
    continue
  test_new[col] = pandas.factorize(test_new[col])[0]

In [0]:
# Split the target column off the feature frame. train_new is modified in
# place, so this cell cannot be run twice without rebuilding train_new first.
label = train_new["Market Share_total"]
del train_new["Market Share_total"]

In [0]:
# Standardize the training features and preview the first rows of the result.
# The scaled array is wrapped back into a DataFrame so the column names and
# the row index of train_new are preserved.
# NOTE(review): the scaler is fitted on every row before the hold-out split is
# made, so the held-out rows contribute to the fitted statistics -- consider
# fitting on the training part only.
Standard_Scaler = preprocessing.StandardScaler()
train_normalizerd = pandas.DataFrame(
    Standard_Scaler.fit_transform(train_new),
    index=train_new.index,
    columns=train_new.columns,
)
train_normalizerd.head()

Unnamed: 0,Station,Channel Type,Season,Year,Date,Day of week,Length,Name of show,Genre,First time or rerun,# of episode in the season,Movie?,Game of the Canadiens during episode?
0,-1.704273,-2.576988,-0.996422,-1.838599,-1.74949,-1.49351,0.903778,-0.937988,-1.404569,-0.169323,-0.132593,-0.101491,-0.289909
1,-1.704273,-2.576988,-0.996422,-1.838599,-1.74949,-1.49351,-0.226878,-0.937394,-1.261063,-0.169323,-0.132593,-0.101491,-0.289909
2,-1.704273,-2.576988,-0.996422,-1.838599,-1.74949,-1.49351,-0.226878,-0.9368,-1.117557,-0.169323,-0.132593,-0.101491,-0.289909
3,-1.704273,-2.576988,-0.996422,-1.838599,-1.74949,-1.49351,0.150007,-0.936205,-0.97405,-0.169323,-0.132593,-0.101491,-0.289909
4,-1.704273,-2.576988,-0.996422,-1.838599,-1.74949,-1.49351,-0.226878,-0.935611,-0.830544,-0.169323,-0.132593,-0.101491,-0.289909


In [0]:
# Hold out 15% of the labelled rows for evaluation; the seed is fixed so the
# split is the same on every run. The shapes of the four parts are printed.
X_train, X_test, Y_train, Y_test = train_test_split(
    train_normalizerd,
    label,
    test_size=0.15,
    random_state=43,
)
for caption, part in (
    ("shape of train data : ", X_train),
    ("shape of train label : ", Y_train),
    ("shape of test data : ", X_test),
    ("shape of test label : ", Y_test),
):
  print(caption, part.shape)


shape of train data :  (524157, 13)
shape of train label :  (524157,)
shape of test data :  (92499, 13)
shape of test label :  (92499,)


In [0]:
#GradientBoostingRegressor
# Fit on the training split and report hold-out metrics.
# random_state is pinned (same seed as the train/test split) so repeated runs
# build the same model and the printed numbers can be reproduced.
lr=ensemble.GradientBoostingRegressor(n_estimators=200,random_state=43)
model=lr.fit(X_train,Y_train)
# Predict once and reuse the predictions for every metric. For a regressor,
# .score() is the R^2 of its own predictions, i.e. the same value as r2_score,
# so calling it separately would only repeat the prediction pass.
prediction=model.predict(X_test)
r2=r2_score(Y_test,prediction)
print("score on test is : ",r2)
print("MSE on test is : ",mean_squared_error(Y_test,prediction))
print("MAE on test is : ",mean_absolute_error(Y_test,prediction))
print("R2 Score on test is : ",r2)

score on test is :  0.7511348482368991
MSE on test is :  5.97436645149373
MAE on test is :  1.46192010368699
R2 Score on test is :  0.7511348482368991


In [0]:
#LinearRegression
# Least-squares baseline: fit on the training split, then print the model's
# own score followed by MSE, MAE and R^2 computed on the hold-out predictions.
LReg = linear_model.LinearRegression()
model = LReg.fit(X_train, Y_train)
prediction = model.predict(X_test)
print("score on test is : ", model.score(X_test, Y_test))
for metric_caption, metric_fn in (
    ("MSE on test is : ", mean_squared_error),
    ("MAE on test is : ", mean_absolute_error),
    ("R2 Score on test is : ", r2_score),
):
  print(metric_caption, metric_fn(Y_test, prediction))

score on test is :  0.3388884069064424
MSE on test is :  15.870936105315124
MAE on test is :  2.325512034305699
R2 Score on test is :  0.3388884069064424


In [0]:
#RandomForestRegressor
# Fit on the training split and report hold-out metrics.
# A random forest is built from bootstrap samples, so random_state is pinned
# (same seed as the train/test split) to make the printed numbers reproducible.
# NOTE(review): n_estimators is left at the library default, which differs
# between scikit-learn releases -- consider setting it explicitly.
RFR=ensemble.RandomForestRegressor(random_state=43)
model=RFR.fit(X_train,Y_train)
# Predict once and reuse the predictions for every metric. For a regressor,
# .score() is the R^2 of its own predictions, i.e. the same value as r2_score.
prediction=model.predict(X_test)
r2=r2_score(Y_test,prediction)
print("score on test is : ",r2)
print("MSE on test is : ",mean_squared_error(Y_test,prediction))
print("MAE on test is : ",mean_absolute_error(Y_test,prediction))
print("R2 Score on test is : ",r2)

score on test is :  0.8483542560786426
MSE on test is :  3.640474524364072
MAE on test is :  1.1001470130680944
R2 Score on test is :  0.8483542560786426


In [0]:
#Decision Tree
# Fit a single regression tree (depth capped at 100) on the training split and
# report hold-out metrics.
# random_state is pinned (same seed as the train/test split): candidate
# features are visited in a random order, so ties between equally good splits
# can otherwise resolve differently from run to run.
Dtree=DecisionTreeRegressor(max_depth=100,random_state=43)
model=Dtree.fit(X_train,Y_train)
# Predict once and reuse the predictions for every metric. For a regressor,
# .score() is the R^2 of its own predictions, i.e. the same value as r2_score.
prediction=model.predict(X_test)
r2=r2_score(Y_test,prediction)
print("score on test is : ",r2)
print("MSE on test is : ",mean_squared_error(Y_test,prediction))
print("MAE on test is : ",mean_absolute_error(Y_test,prediction))
print("R2 Score on test is : ",r2)


score on test is :  0.7732154244908568
MSE on test is :  5.444290412053431
MAE on test is :  1.3110434487820375
R2 Score on test is :  0.7732154244908568


In [0]:
#KNeighborsRegressor
# 5-nearest-neighbour regression: fit on the training split and report
# hold-out metrics.
knn=KNeighborsRegressor(n_neighbors=5)
model=knn.fit(X_train,Y_train)
# Neighbour search at prediction time is the expensive step for this model,
# so the hold-out predictions are computed once and reused for every metric.
# For a regressor, .score() is the R^2 of its own predictions, i.e. the same
# value as r2_score, so it is not called separately.
prediction=model.predict(X_test)
r2=r2_score(Y_test,prediction)
print("score on test is : ",r2)
print("MSE on test is : ",mean_squared_error(Y_test,prediction))
print("MAE on test is : ",mean_absolute_error(Y_test,prediction))
print("R2 Score on test is : ",r2)