# Import the libraries

In [37]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score
from tabulate import tabulate
import klib

# Get the datasets

In [13]:
trn_data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Datasets/train.csv')
test_data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Datasets/test.csv')

In [14]:
trn_data.head()

Unnamed: 0,x,y
0,24.0,21.549452
1,50.0,47.464463
2,15.0,17.218656
3,38.0,36.586398
4,87.0,87.288984


In [16]:
trn_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 700 entries, 0 to 699
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   x       700 non-null    float64
 1   y       699 non-null    float64
dtypes: float64(2)
memory usage: 11.1 KB


In [18]:
trn_data.isnull().sum()

Unnamed: 0,0
x,0
y,1


# Clean the data

In [19]:
im = SimpleImputer(strategy='most_frequent')
trn_data['y'] = im.fit_transform(trn_data[['y']])

In [20]:
trn_data.isnull().sum()

Unnamed: 0,0
x,0
y,0


In [22]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   x       300 non-null    int64  
 1   y       300 non-null    float64
dtypes: float64(1), int64(1)
memory usage: 4.8 KB


In [21]:
test_data.isnull().sum()

Unnamed: 0,0
x,0
y,0


In [26]:
trn_data = klib.data_cleaning(trn_data)
test_data = klib.data_cleaning(test_data)

Shape of cleaned data: (700, 2) - Remaining NAs: 0


Dropped rows: 0
     of which 0 duplicates. (Rows (first 150 shown): [])

Dropped columns: 0
     of which 0 single valued.     Columns: []
Dropped missing values: 0
Reduced memory by at least: 0.0 MB (-0.0%)

Shape of cleaned data: (300, 2) - Remaining NAs: 0


Dropped rows: 0
     of which 0 duplicates. (Rows (first 150 shown): [])

Dropped columns: 0
     of which 0 single valued.     Columns: []
Dropped missing values: 0
Reduced memory by at least: 0.0 MB (-nan%)



  mem_perc = round(100 * mem_change / data_mem, 2)


In [27]:
trn_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 700 entries, 0 to 699
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   x       700 non-null    float32
 1   y       700 non-null    float32
dtypes: float32(2)
memory usage: 5.6 KB


In [28]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   x       300 non-null    int8   
 1   y       300 non-null    float32
dtypes: float32(1), int8(1)
memory usage: 1.6 KB


In [30]:
X_train = trn_data.iloc[:, :-1].values
y_train = trn_data.iloc[:, -1].values

In [31]:
X_test = test_data.iloc[:, :-1].values
y_test = test_data.iloc[:, -1].values

# Define the regressor models

In [29]:
lr_reg = LinearRegression()
svr_reg = SVR()

# Scale the datasets

In [32]:
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Fit the models

In [35]:
lr_reg.fit(X_train, y_train)
svr_reg.fit(X_train, y_train)

# Predict the results on the test set

In [36]:
y_pred_lr = lr_reg.predict(X_test)
y_pred_svr = svr_reg.predict(X_test)

# Calculate the metrics

In [38]:
mse_lr = mean_squared_error(y_test, y_pred_lr)
mse_svr = mean_squared_error(y_test, y_pred_svr)

In [40]:
r2_lr = r2_score(y_test, y_pred_lr)
r2_svr = r2_score(y_test, y_pred_svr)

In [39]:
print(mse_lr)
print(mse_svr)

793.6397289460517
18.209171527577574


In [41]:
print(r2_lr)
print(r2_svr)

0.05780854329155183
0.978382476051437


# Compare the results

In [44]:
headers = ['Model', 'MSE', 'R2']
rows = [['Logistic Regression', 793.63, 0.05],
        ['Support Vector Regressor', 18.2, 0.97]]

print(tabulate(rows, headers, tablefmt='fancy_grid'))

╒══════════════════════════╤════════╤══════╕
│ Model                    │    MSE │   R2 │
╞══════════════════════════╪════════╪══════╡
│ Logistic Regression      │ 793.63 │ 0.05 │
├──────────────────────────┼────────┼──────┤
│ Support Vector Regressor │  18.2  │ 0.97 │
╘══════════════════════════╧════════╧══════╛
