# Linear regression model with L2 regularization (Ridge)

In [36]:
import numpy as np
import pandas as pd

from sklearn.linear_model import Ridge
# NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2 and is not
# used in the visible cells; this import fails on newer releases.
from sklearn.metrics import plot_confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# Raw string: the original literal contained "\M" and "\P", which are invalid
# escape sequences (a warning on current Python, slated to become an error).
# The resulting path is byte-for-byte the same.
# NOTE(review): absolute, machine-specific path; the notebook only runs where
# this file exists.
data = pd.read_csv(r"E:\Major project\Placement_Data_Full_Class.csv")

In [37]:
# Print column names, dtypes and non-null counts for the loaded frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 215 entries, 0 to 214
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   sl_no           215 non-null    int64  
 1   gender          215 non-null    object 
 2   ssc_p           215 non-null    float64
 3   ssc_b           215 non-null    object 
 4   hsc_p           215 non-null    float64
 5   hsc_b           215 non-null    object 
 6   hsc_s           215 non-null    object 
 7   degree_p        215 non-null    float64
 8   degree_t        215 non-null    object 
 9   workex          215 non-null    object 
 10  etest_p         215 non-null    float64
 11  specialisation  215 non-null    object 
 12  mba_p           215 non-null    float64
 13  status          215 non-null    object 
 14  salary          148 non-null    float64
dtypes: float64(6), int64(1), object(8)
memory usage: 25.3+ KB


In [38]:
#Preprocessing
def binary_encode(df, column_dict):
    """Return a copy of ``df`` with the given columns recoded as 0/1 integers.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is copied and never modified.
    column_dict : dict
        Maps each column name to the value that should become 1. Every other
        value in that column becomes 0.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` in which each listed column is an int64 0/1 column.
    """
    df = df.copy()
    for column, positive_value in column_dict.items():
        # Vectorized comparison instead of a per-row Python lambda; the explicit
        # int64 cast keeps the dtype stable even when the column is empty.
        df[column] = (df[column] == positive_value).astype("int64")
    return df

def onehot_encode(df, column_dict):
    """Return a copy of ``df`` with each listed column replaced by indicator columns.

    ``column_dict`` maps a column name to the prefix passed to
    ``pd.get_dummies``. The indicator columns are appended to the right of the
    frame and the source column is then dropped. The input frame is not modified.
    """
    result = df.copy()
    for source_name, dummy_prefix in column_dict.items():
        indicators = pd.get_dummies(result[source_name], prefix=dummy_prefix)
        # Append first, drop second: same order of operations as before.
        widened = pd.concat([result, indicators], axis=1)
        result = widened.drop(columns=source_name)
    return result

In [39]:
def preprocess_inputs(df, train_size=0.8, random_state=123):
    """Encode, split and scale the placement data for salary regression.

    Rows whose ``salary`` is missing are set aside (without the target) so that
    salaries can be predicted for them later. The remaining rows are split into
    train and test sets, and every feature matrix is standardized with a scaler
    fitted on the training split only.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data containing ``sl_no``, ``salary`` and the categorical columns
        listed in the two encoding dictionaries below. It is not modified.
    train_size : float, default 0.8
        Passed to ``train_test_split``.
    random_state : int, default 123
        Seed passed to ``train_test_split``.

    Returns
    -------
    X_train, X_test
        Scaled feature matrices for the labelled rows.
    y_train, y_test
        Salary targets matching ``X_train`` / ``X_test``.
    missing_salaries
        Scaled feature matrix for the rows that have no salary.
    missing_salary_ids
        ``sl_no`` values of those rows, re-indexed from 0.
    """
    df = df.copy()

    # Encode categorical features
    # column -> value that is encoded as 1 (everything else becomes 0)
    binary_feature_dict = {
        'gender': 'M',
        'ssc_b': 'Central',
        'hsc_b': 'Central',
        'workex': 'Yes',
        'specialisation': 'Mkt&Fin',
        'status': 'Placed'
    }

    # column -> prefix of the generated indicator columns
    nominal_feature_dict = {
        'hsc_s': 'hsc',
        'degree_t': 'deg'
    }
    df = binary_encode(df, binary_feature_dict)
    df = onehot_encode(df, nominal_feature_dict)

    # NOTE(review): 'status' stays in the feature set. Presumably every row that
    # has a salary is 'Placed', which would make this column constant in the
    # training data -- confirm and consider dropping it.

    # Split missing salary data from df and save it for later.
    # The mask looks at the target column only, so a NaN in some feature column
    # cannot send a labelled row into the "to be predicted" set.
    salary_missing = df['salary'].isna()
    missing_salaries = df[salary_missing].drop('salary', axis=1)

    missing_salary_ids = missing_salaries['sl_no'].reset_index(drop=True).copy()

    # Boolean mask rather than a label-based drop: also correct if the incoming
    # index is not unique.
    df = df[~salary_missing].reset_index(drop=True)

    # Drop sl_no column (an identifier, not a feature)
    df = df.drop('sl_no', axis=1)
    missing_salaries = missing_salaries.drop('sl_no', axis=1)

    # Split df into X and y
    y = df['salary'].copy()
    X = df.drop('salary', axis=1).copy()

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=train_size, random_state=random_state
    )

    # Scale X with a standard scaler fitted on the training split only, so no
    # statistics from the test or unlabeled rows leak into the scaling.
    scaler = StandardScaler()
    scaler.fit(X_train)

    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)
    missing_salaries = scaler.transform(missing_salaries)
    return X_train, X_test, y_train, y_test, missing_salaries, missing_salary_ids
    

In [40]:
# Run the full preprocessing pipeline on the loaded frame and unpack its six results.
(
    X_train,
    X_test,
    y_train,
    y_test,
    missing_salaries,
    missing_salary_ids,
) = preprocess_inputs(data)

# Training/Results


In [41]:
# Fit a ridge regressor (L2 penalty strength alpha=100.0) on the scaled
# training split; fit() returns the estimator itself.
model = Ridge(alpha=100.0).fit(X_train, y_train)

# Coefficient of determination on the held-out split. The recorded output
# below is about 0.03, i.e. the model explains very little of the test variance.
model_r2 = model.score(X_test, y_test)

print("Model R^2: {:.5f}".format(model_r2))

Model R^2: 0.03140


In [42]:
# Predict a salary for every row that had none and name the column to match the target.
missing_salary_predictions = pd.Series(
    data=model.predict(missing_salaries),
    name='salary',
)

print("Potential salaries for missing target values:")
# Both series are indexed from 0, so a column-wise concat lines each
# prediction up with its sl_no; as the last expression it is displayed as the cell output.
pd.concat([missing_salary_ids, missing_salary_predictions], axis="columns")

Potential salaries for missing target values:


Unnamed: 0,sl_no,salary
0,4,300029.076252
1,6,295397.270069
2,7,262684.013410
3,10,273913.473268
4,13,256581.522217
...,...,...
62,199,314834.883618
63,202,274553.852116
64,207,294912.906217
65,209,269195.020570
