In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.linear_model import Ridge
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.compose import ColumnTransformer
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sqlalchemy import create_engine
import numpy as np

In [None]:
engine = create_engine('postgresql://postgres:password@localhost:5432/postgres')

student_data= pd.read_csv('data.csv',delimiter=';')
student_data.to_sql('students', con=engine, if_exists='replace', index=False)

In [None]:
def add_student(new_student):
    new_student_df = pd.DataFrame(new_student)
    new_student_df.to_sql('students', con=engine, if_exists='append', index=False)

In [None]:
new_student_dict = {
    'school': 'GP',
    'sex': 'F',
    'age': 17,
    'address': 'U',
    'famsize': 'GT3',
    'Pstatus': 'T',
    'Medu': 4,
    'Fedu': 3,
    'Mjob': 'health',
    'Fjob': 'teacher',
    'reason': 'home',
    'guardian': 'mother',
    'traveltime': 2,
    'studytime': 3,
    'failures': 0,
    'schoolsup': 'no',
    'famsup': 'yes',
    'paid': 'no',
    'activities': 'yes',
    'nursery': 'yes',
    'higher': 'yes',
    'internet': 'yes',
    'romantic': 'no',
    'famrel': 5,
    'freetime': 4,
    'goout': 3,
    'Dalc': 2,
    'Walc': 3,
    'health': 4,
    'absences': 1,
    'G1': 15,
    'G2': 14,
    'G3': 16
}
add_student([new_student_dict])

In [5]:
student_df = pd.read_csv("data.csv",delimiter=';')
print(student_df)

    school sex  age address famsize Pstatus  Medu  Fedu      Mjob      Fjob  \
0       GP   F   16       U     LE3       T     4     3   teacher  services   
1       GP   M   18       U     LE3       T     1     1     other     other   
2       GP   M   17       R     LE3       A     4     4   teacher     other   
3       GP   F   15       U     LE3       T     3     2  services     other   
4       GP   M   16       U     GT3       T     2     3     other     other   
..     ...  ..  ...     ...     ...     ...   ...   ...       ...       ...   
311     GP   M   15       U     LE3       A     2     1  services     other   
312     GP   F   17       U     GT3       A     4     4     other   teacher   
313     GP   F   15       U     GT3       T     4     4   teacher   teacher   
314     MS   F   19       R     GT3       T     2     3  services     other   
315     GP   F   16       U     GT3       T     4     3     other   at_home   

     ... famrel freetime  goout  Dalc  Walc health 

In [None]:
student_df.isnull().sum()

In [None]:
student_df.info()

In [None]:
categorical_features=student_df.select_dtypes(include=['object']).columns
categorical_features

In [None]:
num_featur=student_df.select_dtypes(exclude=['object'])
num_featur

In [None]:
# Creating a correlation matrix for selected columns
correlation_matrix_selected = num_featur.corr()

# Plot heatmap for selected columns
plt.figure(figsize=(16, 8))
sns.heatmap(correlation_matrix_selected, annot=True, cmap='coolwarm', linewidths=0.5)
plt.title('Correlation Matrix for Selected Columns')
plt.show()

In [None]:
data_encoded = pd.get_dummies(student_df, drop_first=True)
print(data_encoded.info)
print(data_encoded.shape)
# Correlation matrix
plt.figure(figsize=(16, 12))
correlation_matrix = data_encoded.corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', linewidths=0.5)
plt.title('Correlation Matrix')
plt.show()

In [None]:
# Select columns for correlation matrix
selected_columns = ['G1', 'G2', 'G3', 'studytime', 'failures', 'absences']
selected_data = student_df[selected_columns]
# Creating a correlation matrix for selected columns
correlation_matrix_selected = selected_data .corr()

# Plot heatmap for selected columns
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix_selected, annot=True, cmap='coolwarm', linewidths=0.5)
plt.title('Correlation Matrix for Selected Columns')
plt.show()


In [None]:
data_encoded .hist(bins = 25, figsize = (15,10)) 

In [None]:
X =data_encoded.copy()
y = X.pop("G3")


In [None]:
from sklearn.feature_selection import mutual_info_regression

def make_mi_scores(X, y):
    mi_scores = mutual_info_regression(X, y)
    mi_scores = pd.Series(mi_scores, index=X.columns)
    mi_scores = mi_scores.sort_values(ascending=False)
    return mi_scores

mi_scores = make_mi_scores(X, y)
print(mi_scores)

In [None]:
def plot_mi_scores(scores):
    scores = scores.sort_values(ascending=True)
    width = np.arange(len(scores))
    ticks = list(scores.index)
    plt.barh(width, scores,color='salmon')
    plt.yticks(width, ticks)
    plt.title("Mutual Information Scores")


plt.figure(dpi=100, figsize=(16, 10))
plot_mi_scores(mi_scores)

In [13]:

def evaluate_model(X_test, y_test, model):
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    mean_absolute_error_s = mean_absolute_error(y_test, y_pred)
    evaluation_results = {
        'Model': model,
        'R2 Score': r2,
        'Mean Squared Error': mse,
        'Mean Absolute Error': mean_absolute_error_s
    }
   
    df_result = pd.DataFrame([evaluation_results])
    
    return df_result    
def visualize_evaluation(df_result):
    # Extract model names and evaluation metrics
    models = df_result['Model']
    r2_scores = df_result['R2 Score']
    mse_values = df_result['Mean Squared Error']
    mae_values = df_result['Mean Absolute Error']

    # Create a bar plot
    fig, ax = plt.subplots(figsize=(10, 6))
    width = 0.2

    ax.bar(models, r2_scores, width, label='R2 Score')
    ax.bar(models, mse_values, width, label='Mean Squared Error', bottom=r2_scores)
    ax.bar(models, mae_values, width, label='Mean Absolute Error', bottom=r2_scores + mse_values)

    ax.set_ylabel('Metrics')
    ax.set_title('Model Evaluation Metrics')
    ax.legend()

    plt.show()




def tune_hyperparameters(model, param_grid, X_train, y_train):
    grid_search = GridSearchCV(model, param_grid, scoring='neg_mean_squared_error', cv=5, n_jobs=-1)
    grid_search.fit(X_train, y_train)
    best_params = grid_search.best_params_
    best_model = grid_search.best_estimator_
    best_score = grid_search.best_score_
    results_df = pd.DataFrame(grid_search.cv_results_)
    param_results = {
        'best_params': best_params,
        'best_model': best_model,
        'best_score': best_score,
        'result': results_df
    }
    return param_results

In [17]:
def data_process(data, models):
    X = data.drop("G3", axis=1)
    y = data["G3"]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    X_train_preprocessed = pd.get_dummies(X_train, drop_first=True)
    X_test_preprocessed = pd.get_dummies(X_test, drop_first=True)
    
    tuned_results = []
    untuned_results = []
    for model_name, model in models.items():
        print(param_grids[model_name])
        tuning_results = tune_hyperparameters(model, param_grids[model_name], X_train_preprocessed, y_train)
        best_params, best_model, best_score,  results_df = tuning_results['best_params'], tuning_results['best_model'], tuning_results['best_score'],tuning_results[results_df]

        print(f"{model_name} - Best Params: {best_params}, Best Score: {best_score}")
        tuned_results = evaluate_model(X_test_preprocessed, y_test, best_model)
        
     
        print(f"{model_name} - Tuned Results: {tuned_results}")

        # Fit the best model
        model.fit(X_train_preprocessed, y_train)
        untuned_results = evaluate_model(X_test_preprocessed, y_test, model)
        print(f"{model_name} - Untuned Results: {untuned_results}")

    return tuned_results,untuned_results

In [19]:
svr_param_grid = {
    'C': [0.1, 1, 10, 100, 1000],
    'epsilon': [0.01, 0.1, 0.2],
    'kernel': ['linear', 'rbf']
}
#svr_model = SVR()


ridge_param_grid = {'alpha': [0.1, 1, 10]} #default=1.0
#ridge_model = Ridge()

randomForest_param_grid={
    'n_estimators':[100,200,500],  #default 100
    'criterion':['squared_error','absolute_error'], #default squared_error
    'min_samples_split':[2,3,4,5], #default 2
    'min_samples_leaf':[1,2,4,5], #default=1
    'max_leaf_nodes':[4,10,20,50,None] #default=None
}
#randomForest_model=RandomForestRegressor()
# Dictionary of models
models_selected = {
    'RandomForestRegressor': RandomForestRegressor(),
    'Linear SVR': SVR(),
    'Ridge Regressor': Ridge()
}

# Dictionary of param grids
param_grids = {
    'RandomForestRegressor': randomForest_param_grid, 
    'Linear SVR': svr_param_grid,
    'Ridge Regressor': ridge_param_grid
}

df1 = student_df.copy()
tuned_df1,untuned_df1=data_process(df1, models_selected)
print(tuned_df1,untuned_df1)
selection = ["G1", "G2", "G3"]
df2 = df1[selection].copy()
tuned_df2,untuned_df2=data_process(df2, models_selected)
print(tuned_df2,untuned_df2)

{'n_estimators': [100, 200, 500], 'criterion': ['squared_error', 'absolute_error'], 'min_samples_split': [2, 3, 4, 5], 'min_samples_leaf': [1, 2, 4, 5], 'max_leaf_nodes': [4, 10, 20, 50, None]}


UnboundLocalError: cannot access local variable 'results_df' where it is not associated with a value

In [10]:
models = results_df1['Model']
tuned_r2_scores = results_df1['Tuned R2 Score']
untuned_r2_scores = results_df1['Untuned R2 Score']

# Plotting
plt.figure(figsize=(10, 6))
bar_width = 0.35

bar1 = plt.bar(models, tuned_r2_scores, width=bar_width, label='Tuned R2 Score')
bar2 = plt.bar(models, untuned_r2_scores, width=bar_width, label='Untuned R2 Score', alpha=0.7)

plt.xlabel('Models')
plt.ylabel('R2 Score')
plt.title('Tuned and Untuned R2 Scores for Different Models')
plt.legend()

plt.show()

TypeError: 'NoneType' object is not subscriptable