In [47]:
# importing the necessary libraries
import pandas as pd
import numpy as np
# for data visualization
import plotly_express as px

In [48]:
# load the dataset from CSV into a DataFrame
# NOTE(review): this is an absolute, machine-specific Windows path, so the notebook
# only runs where the file exists at exactly this location — consider a path
# relative to the notebook (e.g. a configurable data directory)
df = pd.read_csv(r"C:\Users\ADITYA\Downloads\Crush_data.csv")
# display the frame as the cell output
df

Unnamed: 0,Common_Interests_Score,Text_Frequency,Joke_Laugh_Count,Zodiac_Compatibility_Score,Outfit_Match_Score,Compatibility_Score
0,6,18,4,0,0,28.932584
1,3,30,5,3,1,41.853933
2,10,15,6,9,3,44.382022
3,7,13,20,9,1,55.617978
4,4,47,0,5,10,62.359551
...,...,...,...,...,...,...
1995,7,41,17,4,1,78.651685
1996,8,46,5,0,9,70.224719
1997,3,29,19,9,0,64.887640
1998,0,40,0,5,5,44.943820


**As we can see, there are 6 columns in total in the dataset. Compatibility_Score is the output feature (target), which we have to predict with the help of the remaining five input features:**
<br>
1. Common_Interests_Score : A score (0-10) based on how many common hobbies or interests you and your crush share.
<br>
2. Text_Frequency : Average number of texts exchanged per day.
<br>
3. Joke_Laugh_Count : Number of times your crush laughs at your jokes in a week.
<br>
4. Zodiac_Compatibility_Score : Compatibility score based on zodiac signs (0-10)
<br>
5. Outfit_Match_Score : A score (0-10) representing how often your outfits align with their preferences.

In [49]:
# checking that the data contains missing values or not
df.isnull().sum()

Common_Interests_Score        0
Text_Frequency                0
Joke_Laugh_Count              0
Zodiac_Compatibility_Score    0
Outfit_Match_Score            0
Compatibility_Score           0
dtype: int64

In [50]:
# checking for duplicated values
df.duplicated().sum()

0

**The data is clean: it does not contain any missing values or duplicate rows**

In [51]:
def plot_data(dataframe, column):
    """
    Build the standard univariate views for one column of a dataframe.

    Parameters
    ----------
    dataframe : the data to summarise and plot
    column : name of the column to inspect

    Returns
    -------
    A tuple ``(hist, stats, box)``: the histogram figure of the column (bars
    outlined in black), the result of ``describe()`` on the column, and the
    box plot figure of the column drawn along the y axis.

    """
    # descriptive statistics of the selected column
    summary = dataframe[column].describe()

    # histogram of the column, with a 1px black outline around every bar
    histogram = px.histogram(data_frame=dataframe, x=column)
    histogram.update_traces(marker_line_color="black", marker_line_width=1)

    # box plot of the same column (useful for spotting outliers)
    box_plot = px.box(data_frame=dataframe, y=column)

    return histogram, summary, box_plot

**Stats of Common_Interests_Score**

In [52]:
hist, stats, box = plot_data(df, 'Common_Interests_Score')

In [53]:
hist.show()

In [54]:
stats

count    2000.000000
mean        4.899500
std         3.181896
min         0.000000
25%         2.000000
50%         5.000000
75%         8.000000
max        10.000000
Name: Common_Interests_Score, dtype: float64

In [55]:
# checking for outliers
box

**Stats of Text_Frequency**

In [56]:
hist, stats, box = plot_data(df, 'Text_Frequency')

In [57]:
hist.show()

In [58]:
stats

count    2000.000000
mean       25.002500
std        14.565115
min         0.000000
25%        13.000000
50%        25.000000
75%        37.000000
max        50.000000
Name: Text_Frequency, dtype: float64

In [59]:
box

**Stats of Joke_Laugh_Count**

In [60]:
hist, stats, box = plot_data(df, 'Joke_Laugh_Count')

In [61]:
hist

In [62]:
stats

count    2000.00000
mean       10.25500
std         5.95934
min         0.00000
25%         5.00000
50%        10.00000
75%        15.00000
max        20.00000
Name: Joke_Laugh_Count, dtype: float64

In [63]:
box

**Stats of Zodiac_Compatibility_Score**

In [64]:
hist, stats, box = plot_data(df, 'Zodiac_Compatibility_Score')

In [65]:
hist

In [66]:
stats

count    2000.000000
mean        5.006500
std         3.168513
min         0.000000
25%         2.000000
50%         5.000000
75%         8.000000
max        10.000000
Name: Zodiac_Compatibility_Score, dtype: float64

In [67]:
box

**Stats of Outfit_Match_Score**

In [68]:
hist, stats, box = plot_data(df, 'Outfit_Match_Score')

In [69]:
hist

In [70]:
stats

count    2000.000000
mean        4.988500
std         3.240311
min         0.000000
25%         2.000000
50%         5.000000
75%         8.000000
max        10.000000
Name: Outfit_Match_Score, dtype: float64

In [71]:
box

**Stats of Compatibility_Score**

In [72]:
hist, stats, box = plot_data(df, 'Compatibility_Score')

In [73]:
hist

In [74]:
stats

count    2000.000000
mean       50.752388
std        19.747691
min         0.000000
25%        36.235955
50%        50.842697
75%        65.730337
max       100.000000
Name: Compatibility_Score, dtype: float64

**As we can see, no outliers are present in the data, which is a plus point. The distributions of the input feature columns are nearly uniform, while the output feature shows an approximately normal distribution**

**Correlation**

In [75]:
df.corr()['Compatibility_Score']

Common_Interests_Score        0.244407
Text_Frequency                0.851382
Joke_Laugh_Count              0.475757
Zodiac_Compatibility_Score    0.141843
Outfit_Match_Score            0.078625
Compatibility_Score           1.000000
Name: Compatibility_Score, dtype: float64

**The output feature is most strongly correlated with the Text_Frequency (0.85) and Joke_Laugh_Count (0.48) input features**

In [76]:
# extract the input features and output feature as NumPy arrays
# X: the first five columns of df; y: the Compatibility_Score column
# NOTE: the positional slice [:5] relies on Compatibility_Score being the last
# of the six columns, so that the first five are exactly the input features
X, y = df[df.columns[:5]].values, df['Compatibility_Score'].values

In [77]:
X

array([[ 6, 18,  4,  0,  0],
       [ 3, 30,  5,  3,  1],
       [10, 15,  6,  9,  3],
       ...,
       [ 3, 29, 19,  9,  0],
       [ 0, 40,  0,  5,  5],
       [ 0, 33,  0,  7,  7]], dtype=int64)

In [78]:
y

array([28.93258427, 41.85393258, 44.38202247, ..., 64.88764045,
       44.94382022, 39.88764045])

In [79]:
# perform train test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [80]:
len(X_train)

1600

In [81]:
len(X_test)

400

In [82]:
# implementing Multiple Linear Regression
class Multiple_Linear_Regression:
    """
    Ordinary least-squares multiple linear regression.

    The model is ``y = X @ coef_ + intercept_``. The parameters are estimated
    in ``fit`` by solving the least-squares problem on the design matrix
    (the features with a leading column of ones for the intercept).

    Attributes
    ----------
    coef_ : one weight per input feature, or None before fitting
    intercept_ : the bias term, or None before fitting
    """

    def __init__(self, coef_ = None, intercept_=None):
        # both are accepted as constructor arguments so that the object can be
        # rebuilt from the dictionary returned by get_params()
        self.coef_ = coef_
        self.intercept_ = intercept_

    def fit(self, X_train, y_train):
        """
        Estimate ``coef_`` and ``intercept_`` from the training data.

        Parameters
        ----------
        X_train : 2-D array-like of shape (n_samples, n_features)
        y_train : 1-D array-like of shape (n_samples,)
        """
        X_train = np.asarray(X_train, dtype=float)
        y_train = np.asarray(y_train, dtype=float)
        # prepend a column of ones so that the first coefficient is the intercept
        design = np.insert(X_train, 0, 1, axis = 1)
        # solve the least-squares problem directly instead of inverting X^T X:
        # this is numerically more stable and still returns a (minimum-norm)
        # solution when the features are collinear, where the inverse fails
        betas = np.linalg.lstsq(design, y_train, rcond=None)[0]
        self.coef_ = betas[1:]
        self.intercept_ = betas[0]

    def predict(self, X_test):
        """
        Return the predictions ``X_test @ coef_ + intercept_``.

        Raises
        ------
        ValueError : if the model has no coefficients yet (fit was not called
            and none were passed to the constructor)
        """
        if self.coef_ is None or self.intercept_ is None:
            raise ValueError("This Multiple_Linear_Regression instance is not fitted yet; call fit() first.")
        return  np.dot(X_test, self.coef_) + self.intercept_

    def get_params(self, deep=True):
        """
        Return the constructor parameters as a dictionary.

        ``deep`` is accepted for compatibility with the scikit-learn estimator
        interface and is not used; it defaults to True so that the method can
        also be called without arguments.
        """
        return {"coef_" : self.coef_, "intercept_" : self.intercept_}

In [83]:
mlr = Multiple_Linear_Regression()

In [84]:
mlr.fit(X_train, y_train)

In [85]:
y_pred = mlr.predict(X_test)

In [86]:
# evaluating the model
from sklearn.metrics import mean_absolute_error, root_mean_squared_error, r2_score

# error metrics of the hand-written model on the held-out test set
mae = mean_absolute_error(y_test, y_pred)
rmse = root_mean_squared_error(y_test, y_pred)
# MSE is the square of RMSE, so RMSE is computed only once
mse = rmse ** 2
r2 = r2_score(y_test, y_pred)

# adjusted R2 = 1 - (1 - R2) * (n - 1) / (n - p - 1), where n is the number of
# samples the score was computed on (the test set) and p the number of features
n_samples, n_features = X_test.shape
adj_r2 = 1 - (1 - r2) * (n_samples - 1) / (n_samples - n_features - 1)

print(f"MAE : {mae}")
print(f"MSE : {mse}")
print(f"RMSE : {rmse}")
print(f"R2 Score : {r2}")
print(f"Adjusted R2 Score : {adj_r2}")


MAE : 1.4636736267448213e-13
MSE : 2.3824790517641442e-26
RMSE : 1.54352811822919e-13
R2 Score : 1.0
Adjusted R2 Score : 1.0


In [87]:
# implementing sklearn Linear Regression class
from sklearn.linear_model import LinearRegression

# creating an object of LinearRegression class
lr = LinearRegression()

lr.fit(X_train, y_train)

y_pred_sk = lr.predict(X_test)


In [88]:
# error metrics of the scikit-learn model on the held-out test set
mae = mean_absolute_error(y_test, y_pred_sk)
rmse = root_mean_squared_error(y_test, y_pred_sk)
# MSE is the square of RMSE, so RMSE is computed only once
mse = rmse ** 2
r2 = r2_score(y_test, y_pred_sk)

# adjusted R2 = 1 - (1 - R2) * (n - 1) / (n - p - 1), where n is the number of
# samples the score was computed on (the test set) and p the number of features
n_samples, n_features = X_test.shape
adj_r2 = 1 - (1 - r2) * (n_samples - 1) / (n_samples - n_features - 1)

print(f"MAE : {mae}")
print(f"MSE : {mse}")
print(f"RMSE : {rmse}")
print(f"R2 Score : {r2}")
print(f"Adjusted R2 Score : {adj_r2}")

MAE : 1.2512213487525515e-14
MSE : 2.526780643990792e-28
RMSE : 1.589585054028501e-14
R2 Score : 1.0
Adjusted R2 Score : 1.0


In [89]:
print(mlr.coef_)
print(mlr.intercept_)

[1.68539326 1.12359551 1.40449438 0.84269663 0.56179775]
-7.022471910112245


In [90]:
print(lr.coef_)
print(lr.intercept_)

[1.68539326 1.12359551 1.40449438 0.84269663 0.56179775]
-7.022471910112337


**Cross validation**

In [91]:
# performing cross validation of both Multiple_Linear_Regression and of sklearn Linear Regression class
from sklearn.model_selection import cross_val_score
# 10-fold cross-validated R2 of the hand-written model, one score per fold
r2_folds_custom = cross_val_score(mlr, X, y, cv=10, scoring="r2")
# the same evaluation for the scikit-learn estimator
r2_folds_sk = cross_val_score(lr, X, y, cv=10, scoring="r2")

# average the fold scores into a single number per model
cv_score = r2_folds_custom.mean()
cv_score_sk = r2_folds_sk.mean()

print(f"CV score of Multiple Linear Regression : {cv_score}")
print(f"CV score of Sklearn Linear Regression : {cv_score_sk}")

CV score of Multiple Linear Regression : 1.0
CV score of Sklearn Linear Regression : 1.0


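**Both Multiple_Linear_Regression and the scikit-learn LinearRegression class performed well (an R2 score of 1.0 on the test split and in cross-validation, with errors at the level of floating-point precision, which suggests the target is an exact linear function of the five input features). We will serialize the scikit-learn LinearRegression object for deployment**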

In [92]:
import pickle

# write the fitted scikit-learn model (`lr`) to model.pkl in binary mode;
# the relative file name resolves against the current working directory
with open('model.pkl', 'wb') as file:
    pickle.dump(lr, file)
