<a href="https://colab.research.google.com/github/aumair472/machine_learning_session_1/blob/main/machine_learning_01.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Load the Data

In [3]:
import pandas as pd

df = pd.read_csv('/content/drive/MyDrive/DataSet/Student_Performance.csv')

print('Print First Five Row of the Data Set: \n')
print(df.head())

Print First Five Row of the Data Set: 

   Hours Studied  Previous Scores Extracurricular Activities  Sleep Hours  \
0              7               99                        Yes            9   
1              4               82                         No            4   
2              8               51                        Yes            7   
3              5               52                        Yes            5   
4              7               75                         No            8   

   Sample Question Papers Practiced  Performance Index  
0                                 1               91.0  
1                                 2               65.0  
2                                 2               45.0  
3                                 2               36.0  
4                                 5               66.0  


In [4]:
dependent_variable = ' Performance Index'

independent_variables = [col for col in df.columns if col != dependent_variable]

print('Dependent Variable: \n', dependent_variable)
print('Independent Variables: \n', independent_variables)

print('\n Missing values in the dataset:')
print(df.isnull().sum())



Dependent Variable: 
  Performance Index
Independent Variables: 
 ['Hours Studied', 'Previous Scores', 'Extracurricular Activities', 'Sleep Hours', 'Sample Question Papers Practiced', 'Performance Index']

 Missing values in the dataset:
Hours Studied                       0
Previous Scores                     0
Extracurricular Activities          0
Sleep Hours                         0
Sample Question Papers Practiced    0
Performance Index                   0
dtype: int64


In [5]:
print("Original DataFrame columns:", df.columns)
print("Data types before encoding:\n", df.dtypes)

# Identify categorical columns for one-hot encoding
categorical_cols = df.select_dtypes(include=['object']).columns

if len(categorical_cols) > 0:
    print(f"\nCategorical columns to encode: {list(categorical_cols)}")
    # Apply one-hot encoding
    df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)
    print("\nDataFrame after one-hot encoding and dropping original categorical columns:")
    print(df_encoded.head())
    df = df_encoded
else:
    print("\nNo categorical columns found for encoding.")

print("\nData types after encoding:\n", df.dtypes)

Original DataFrame columns: Index(['Hours Studied', 'Previous Scores', 'Extracurricular Activities',
       'Sleep Hours', 'Sample Question Papers Practiced', 'Performance Index'],
      dtype='object')
Data types before encoding:
 Hours Studied                         int64
Previous Scores                       int64
Extracurricular Activities           object
Sleep Hours                           int64
Sample Question Papers Practiced      int64
Performance Index                   float64
dtype: object

Categorical columns to encode: ['Extracurricular Activities']

DataFrame after one-hot encoding and dropping original categorical columns:
   Hours Studied  Previous Scores  Sleep Hours  \
0              7               99            9   
1              4               82            4   
2              8               51            7   
3              5               52            5   
4              7               75            8   

   Sample Question Papers Practiced  Performance 

In [6]:
from sklearn.model_selection import train_test_split
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

# 1. Separate features (X) and target (y)
y = df['Performance Index']
X = df.drop('Performance Index', axis=1)

print("Shape of X (features):", X.shape)
print("Shape of y (target):", y.shape)

# 2. Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"\nTraining set size: {X_train.shape[0]} samples")
print(f"Testing set size: {X_test.shape[0]} samples")

# 3. Check for multicollinearity using VIF
# Ensure all boolean columns are converted to int for VIF calculation
X_vif = X.copy()
for col in X_vif.select_dtypes(include='bool').columns:
    X_vif[col] = X_vif[col].astype(int)

X_vif_const = add_constant(X_vif)

vif_data = pd.DataFrame()
vif_data["feature"] = X_vif_const.columns
vif_data["VIF"] = [variance_inflation_factor(X_vif_const.values, i) for i in range(X_vif_const.shape[1])]

# Drop the constant row as it's not a feature
vif_data = vif_data.drop(vif_data[vif_data['feature'] == 'const'].index)

print("\nVariance Inflation Factor (VIF) for independent variables:")
print(vif_data.sort_values(by='VIF', ascending=False))

# 4. Confirm data types
print("\nData types of features (X) after preprocessing:")
print(X.dtypes)
print("\nData type of target (y):")
print(y.dtypes)

Shape of X (features): (10000, 5)
Shape of y (target): (10000,)

Training set size: 8000 samples
Testing set size: 2000 samples

Variance Inflation Factor (VIF) for independent variables:
                            feature       VIF
5    Extracurricular Activities_Yes  1.000802
3                       Sleep Hours  1.000600
4  Sample Question Papers Practiced  1.000557
1                     Hours Studied  1.000478
2                   Previous Scores  1.000326

Data types of features (X) after preprocessing:
Hours Studied                       int64
Previous Scores                     int64
Sleep Hours                         int64
Sample Question Papers Practiced    int64
Extracurricular Activities_Yes       bool
dtype: object

Data type of target (y):
float64


In [7]:
from sklearn.linear_model import LinearRegression

# Initialize the Linear Regression model
model = LinearRegression()

# Train the model using the training data
model.fit(X_train, y_train)

print("Multiple Linear Regression model trained successfully.")

Multiple Linear Regression model trained successfully.
