In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import math

In [2]:
raw_data = pd.read_excel('./CSM_dataset.xlsx')
# print(raw_data)
# print(raw_data.to_markdown())
# with pd.option_context('display.max_rows', None,
#                        'display.max_columns', None,
#                        'display.precision', 3,
#                        ):
#     print(raw_data)
print("Before Preprocessing:\n",raw_data.isna().sum())

isna_cols = raw_data.isna().sum()
# print(isna_cols.index)
# taking care of missing values
cols = raw_data.columns
for col in cols:
    if isna_cols[col] :
        _mean = raw_data[col].mean()
        raw_data[col] = raw_data[col].replace(math.nan, _mean)


print("After Preprocessing:\n",raw_data.isna().sum())
# removing "Movie" and "Year" from the dataset 

# print(cols.drop(["Movie"]))
prep_cols = cols.drop(["Movie", 'Year'])
# print(cols)
prep_data = raw_data.drop(['Movie', 'Year'], axis=1)

# since "Movie" name and "Year" do not effect the "Gross" (target variabl) we removed it.

# Normalization using MinMaxScaler

min_max_scaler = MinMaxScaler()
x = prep_data[prep_cols].values
x_scaled = min_max_scaler.fit_transform(x)

normalized = pd.DataFrame(x_scaled, columns=prep_cols, index=prep_data.index)

prep_data[prep_cols] = normalized
print(prep_data)
# splitting the dataset
Y = np.array(prep_data.pop("Gross"))
X = np.array(prep_data)

print(X.shape, Y.shape)
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size=.2)
print(X_test.shape, Y_test.shape)

# raw_data

Before Preprocessing:
 Movie                   0
Year                    0
Ratings                 0
Genre                   0
Gross                   0
Budget                  1
Screens                10
Sequel                  0
Sentiment               0
Views                   0
Likes                   0
Dislikes                0
Comments                0
Aggregate Followers    35
dtype: int64
After Preprocessing:
 Movie                  0
Year                   0
Ratings                0
Genre                  0
Gross                  0
Budget                 0
Screens                0
Sequel                 0
Sentiment              0
Views                  0
Likes                  0
Dislikes               0
Comments               0
Aggregate Followers    0
dtype: int64
      Ratings     Genre     Gross    Budget   Screens    Sequel  Sentiment  \
0    0.571429  0.500000  0.000010  0.015724  0.009949  0.000000   0.567164   
1    0.714286  0.000000  0.298598  0.199776  0.764461  0.16

In [3]:
# init regression model
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor

lr = LinearRegression()
lr_gd = SGDRegressor(max_iter=100000, tol=0.000000001, penalty="l2")

In [4]:
# training model
lr.fit(X_train, Y_train)
score = lr.score(X_test, Y_test)
print("LinAlg Method Score: ",score)


lr_gd.fit(X_train, Y_train)
score = lr_gd.score(X_test, Y_test)
print("SGD Method Score: ",score)


LinAlg Method Score:  0.6924044521506334
SGD Method Score:  0.7027738068811267


In [5]:
from sklearn.metrics import mean_squared_error
# evaluating MSE
y_pred_linAlg = lr.predict(X_test)
mse_linAlg = mean_squared_error(Y_test, y_pred_linAlg)
print(f"MSE_LinAlg:{mse_linAlg:.4f}")

y_pred_gd = lr_gd.predict(X_test)
mse_gd = mean_squared_error(Y_test, y_pred_gd)
print(f"MSE_gd:{mse_gd:.4f}")



MSE_LinAlg:0.0035
MSE_gd:0.0034


### __model.score(y_test, y_pred)__ 
    
This function in scikit-learn computes the performance metric for the model on the provided test data. The specific metric it computes depends on the type of estimator (model) you are using.

+ `For Regression Models:`
    
    + __R^2 Score (Coefficient of Determination):__ For regression models, model.score(X_test, y_test) returns the R^2 score, which measures how well the model's predictions match the actual values. The R^2 score can range from -∞ to 1, with a value of 1 indicating perfect predictions.

    $$R^2 = 1 - \frac{\sum (y_i - \hat{y}_i)^2}{\sum (y_i - \bar{y})^2}$$

+ `For Classification Models:`

    + __Accuracy:__ For classification models, model.score(X_test, y_test) returns the accuracy, which is the proportion of correctly classified samples. The accuracy ranges from 0 to 1, where 1 means all predictions are correct.
