In [2]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt 
import plotly.express as px 
import seaborn as sns 
from sklearn.linear_model import LinearRegression 
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import missingno as msno
import warnings 
warnings.filterwarnings('ignore')
import plotly.graph_objects as go
import pickle
import tkinter as tk
from tkinter import ttk
from PIL import Image, ImageTk 

## Helper functions for exploration & EDA

In [None]:
def Exploration(Datafarme):
    """Print a quick structural overview of a DataFrame.

    Prints the shape (with explicit row and column counts) followed by the
    ``DataFrame.info()`` summary.

    Parameters
    ----------
    Datafarme : pandas.DataFrame
        Frame to summarise.

    Returns
    -------
    None
    """
    shape_data = Datafarme.shape
    print('\nthe shape of data is', shape_data,
          '\nconsist', shape_data[0], 'row',
          '\nconsist', shape_data[1], 'column', '\n\n')
    # DataFrame.info() writes its report to stdout itself and returns None,
    # so call it directly instead of passing its result to print()
    # (which used to emit a stray "None").
    Datafarme.info()

In [None]:
def describtion(df,column_name):
    """Print the minimum, maximum and mean of one column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column.
    column_name : str
        Name of the column to summarise.
    """
    series = df[column_name]
    lowest, highest, mean_value = series.min(), series.max(), series.mean()
    print(f'Data in column {column_name} Ranged From {lowest} to {highest} by averge {mean_value}')
     
    

In [None]:
def plot_all_distplots(df):
    """Draw a histogram with a KDE overlay for every column of ``df``.

    The plots are laid out on a grid three columns wide; grid cells that are
    not needed are removed from the figure.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are plotted (one subplot per column).

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``. Nothing is drawn when
        ``df`` has no columns.
    """
    column_names = list(df.columns)
    if not column_names:
        # A zero-row grid is rejected by plt.subplots, so there is nothing to draw.
        return

    cols = 3
    rows = (len(column_names) + cols - 1) // cols  # ceiling division
    # squeeze=False always yields a 2-D array of axes, whatever the grid shape.
    fig, axes = plt.subplots(rows, cols, figsize=(15, 5 * rows), squeeze=False)
    axes = axes.flatten()

    for ax, col in zip(axes, column_names):
        sns.histplot(df[col], kde=True, ax=ax, color='red')
        ax.set_title(f'Distribution of {col}')

    # Remove the grid cells left over in the last row.
    for unused_ax in axes[len(column_names):]:
        fig.delaxes(unused_ax)

    plt.tight_layout()
    plt.show()

In [None]:
def knowing_nulls_duplicate(df):
    """Print a report of missing values and duplicated rows in ``df``.

    The report lists the null count per column, the total null count, the
    number of duplicated rows and, when there are any, the duplicated rows
    themselves.
    """
    separator = '-' * 200

    nulls_per_column = df.isnull().sum()
    print(nulls_per_column)
    print(separator)
    print(f'Data have {nulls_per_column.sum()} null value')
    print(separator)

    print('Duplicate Values :')
    duplicate_mask = df.duplicated()
    duplicate_total = duplicate_mask.sum()
    print(f'Data have {duplicate_total} duplicaed value')
    if duplicate_total > 0:
        print('duplicated value is :')
        print(df[duplicate_mask])
        return
    print(separator)
        
    

In [None]:
def plot_null_value(df):
    """Draw missingno's bar chart and matrix chart for ``df``.

    Returns
    -------
    tuple
        ``(bar_plot, matrix_plot)`` — the objects returned by ``msno.bar``
        and ``msno.matrix``, in that order.
    """
    # Tuple elements are evaluated left to right: bar first, then matrix.
    return msno.bar(df), msno.matrix(df)
    

In [None]:
def plot_boxplots_with_exam_score(df, target_col='Exam_Score'):
    """Draw a box plot of ``target_col`` against every categorical column.

    Columns of dtype ``category`` or ``object`` are plotted on a grid two
    columns wide; grid cells that are not needed are removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the categorical columns and ``target_col``.
    target_col : str, default 'Exam_Score'
        Column plotted on the y axis of every box plot.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``. Nothing is drawn when
        ``df`` has no categorical columns.
    """
    categorical_columns = df.select_dtypes(include=['category', 'object']).columns
    num_vars = len(categorical_columns)
    if num_vars == 0:
        # A zero-row grid is rejected by plt.subplots, so there is nothing to draw.
        return

    sns.set(style="whitegrid", palette="muted")

    cols = 2
    rows = (num_vars + cols - 1) // cols  # ceiling division

    # squeeze=False always yields a 2-D array of axes, whatever the grid shape.
    fig, axes = plt.subplots(rows, cols, figsize=(15, 6 * rows), squeeze=False)
    fig.suptitle('Boxplot of Exam Score vs Categorical Columns', fontsize=20, fontweight='bold')
    axes = axes.flatten()

    for ax, col in zip(axes, categorical_columns):
        sns.boxplot(x=df[col], y=df[target_col], ax=ax, palette="Set2")

        ax.set_title(f'{col} vs {target_col}', fontsize=16, fontweight='bold')
        ax.set_xlabel(col, fontsize=12)
        ax.set_ylabel(target_col, fontsize=12)

        ax.tick_params(axis='x', rotation=45)
        ax.grid(True, linestyle='--', alpha=0.7)

    # Remove the grid cell left over when the number of plots is odd.
    for unused_ax in axes[num_vars:]:
        fig.delaxes(unused_ax)

    # Leave room at the top of the figure for the suptitle.
    plt.tight_layout(rect=[0, 0, 1, 0.97])
    plt.show()

In [None]:
def knowing_outliers(df):
    """Return the names of numeric columns that contain IQR outliers.

    A value counts as an outlier when it lies strictly below
    ``Q1 - 1.5 * IQR`` or strictly above ``Q3 + 1.5 * IQR``. Only columns of
    dtype ``float64`` or ``int64`` are inspected.

    Returns
    -------
    list
        Column names, in frame order, that have at least one outlier.
    """
    flagged = []
    numeric_names = df.select_dtypes(include=['float64', 'int64']).columns

    for name in numeric_names:
        values = df[name]
        first_quartile = values.quantile(0.25)
        third_quartile = values.quantile(0.75)
        spread = third_quartile - first_quartile

        below = values < first_quartile - 1.5 * spread
        above = values > third_quartile + 1.5 * spread

        # Any row outside the fences marks the whole column.
        if (below | above).any():
            flagged.append(name)

    return flagged


In [None]:
def treat_outliers(dataFrame,x):
    """Clip one column to its 1.5 * IQR fences.

    Values below ``Q1 - 1.5 * IQR`` are raised to that lower fence and values
    above ``Q3 + 1.5 * IQR`` are lowered to that upper fence; everything in
    between is left unchanged.

    Parameters
    ----------
    dataFrame : pandas.DataFrame
        Frame holding the column.
    x : str
        Name of the column to clip.

    Returns
    -------
    pandas.Series
        The clipped column (``dataFrame`` itself is not modified).
    """
    Q1 = dataFrame[x].quantile(0.25)
    Q3 = dataFrame[x].quantile(0.75)
    IQR = Q3 - Q1
    # The lower fence is measured from Q1. Measuring it from Q3 (as before)
    # put it one full IQR too high and clipped perfectly ordinary low values.
    lower_Bound = Q1 - 1.5 * IQR
    upper_Bound = Q3 + 1.5 * IQR
    return dataFrame[x].clip(lower_Bound, upper_Bound)


In [None]:
def _plot_actual_vs_predicted(y_test, y_pred):
    """Build the Plotly scatter of actual vs. predicted values with a y = x reference line."""
    fig = go.Figure()

    fig.add_trace(go.Scatter(x=y_test, y=y_pred,
                             mode='markers',
                             marker=dict(color='blue', size=10, line=dict(width=1, color='DarkSlateGrey')),
                             name='Predicted vs Actual'))

    # Diagonal spanning the observed range: points on it are perfect predictions.
    lowest, highest = y_test.min(), y_test.max()
    fig.add_trace(go.Scatter(x=[lowest, highest],
                             y=[lowest, highest],
                             mode='lines',
                             line=dict(color='red', width=2),
                             name='Perfect Fit'))

    fig.update_layout(title='Actual vs Predicted - Linear Regression',
                      xaxis_title='Actual Values',
                      yaxis_title='Predicted Values',
                      title_font=dict(size=24, family='Arial', color='darkblue'),
                      xaxis=dict(showline=True, linewidth=2, linecolor='black', mirror=True),
                      yaxis=dict(showline=True, linewidth=2, linecolor='black', mirror=True),
                      plot_bgcolor='white',
                      width=800,
                      height=600)

    fig.update_xaxes(tickfont=dict(size=14), gridcolor='lightgrey')
    fig.update_yaxes(tickfont=dict(size=14), gridcolor='lightgrey')
    return fig


def linear_regression_model_plotly(df, scaler, model_used, target_col='Exam_Score',
                                   random_state=42,
                                   model_path='saved_linear_model_new.pkl',
                                   scaler_path='saved_scaler_linear_new.pkl'):
    """Train, evaluate, plot and persist a regression model.

    The frame is split 80/20 into train and test sets, the features are
    scaled (the scaler is fitted on the training part only), the model is
    fitted, MSE and R² on the test set are printed, the fitted model and
    scaler are pickled, and an actual-vs-predicted Plotly figure is shown.

    Parameters
    ----------
    df : pandas.DataFrame
        Fully numeric frame containing the features and ``target_col``.
    scaler : object
        Unfitted scaler exposing ``fit_transform`` / ``transform``.
    model_used : object
        Unfitted estimator exposing ``fit`` / ``predict``.
    target_col : str, default 'Exam_Score'
        Name of the column to predict.
    random_state : int or None, default 42
        Seed for the train/test split so the reported metrics are
        reproducible; pass ``None`` for a different split on every call.
    model_path, scaler_path : str
        Files the fitted model and scaler are pickled to.

    Returns
    -------
    object
        The fitted ``model_used``.

    Raises
    ------
    KeyError
        If ``target_col`` is not a column of ``df``.
    """
    if target_col not in df.columns:
        raise KeyError(f"target column {target_col!r} not found in DataFrame")

    x = df.drop(columns=[target_col])
    y = df[target_col]

    X_train, X_test, y_train, y_test = train_test_split(
        x, y, shuffle=True, test_size=0.2, random_state=random_state)

    # Fit the scaler on the training data only so no test-set statistics leak in.
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    model = model_used
    model.fit(X_train_scaled, y_train)

    y_pred = model.predict(X_test_scaled)

    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f"Mean Squared Error (MSE): {mse:.3f}")
    print(f"R² Score: {r2:.3f}")

    # Persist before rendering so a display/renderer failure cannot lose the fitted artifacts.
    with open(model_path, 'wb') as model_file:
        pickle.dump(model, model_file)

    with open(scaler_path, 'wb') as scaler_file:
        pickle.dump(scaler, scaler_file)

    _plot_actual_vs_predicted(y_test, y_pred).show()

    return model

## Data overview 

In [None]:
# NOTE: machine-specific absolute path — point DATA_PATH at your local copy of the CSV.
DATA_PATH = r"C:\Users\Ayman\Downloads\StudentPerformanceFactors.csv"
df = pd.read_csv(DATA_PATH)
# Preview the first rows rather than echoing the whole frame.
df.head()

In [None]:
# Print the shape and the DataFrame.info() summary of the freshly loaded data.
Exploration(df)

In [None]:
# Convert every object (string) column to pandas' category dtype.
object_columns = df.select_dtypes(include='object').columns
for column_name in object_columns:
    df[column_name] = df[column_name].astype('category')

In [None]:
# Re-run the overview after the object -> category conversion to check the resulting dtypes.
Exploration(df)

In [None]:
# Print the min / max / mean line for each integer column.
integer_columns = df.select_dtypes(include='int').columns
for column_name in integer_columns:
    describtion(df, column_name)
    

In [None]:
# Histogram with KDE overlay for every integer column.
plot_all_distplots(df.select_dtypes('int'))

In [None]:
# Box plot of Exam_Score (the default target_col) against every categorical column.
plot_boxplots_with_exam_score(df)

## Data cleaning 

In [None]:
# Print per-column null counts, the total null count, and any duplicated rows.
knowing_nulls_duplicate(df)

In [None]:
# Draw missingno's bar and matrix charts of missing values and keep the returned plot objects.
null_bar,null_matrix=plot_null_value(df)

**There are no duplicate rows in the data.**

## Treat nulls 

In [None]:
# Show the most frequent value of each column that is about to be filled.
columns_to_inspect = ['Teacher_Quality', 'Parental_Education_Level', 'Distance_from_Home']
for column_name in columns_to_inspect:
    # value_counts() sorts by frequency, so the first index entry is the mode.
    most_frequent = df[column_name].value_counts().index[0]
    print(f'the most frequent value in the column {column_name} is {most_frequent}')

In [None]:
# Fill each column's missing entries with a fixed replacement value.
# Assign the result back instead of calling fillna(inplace=True) on df[col]:
# that form is chained assignment, which acts on a temporary Series and does
# not update df under pandas Copy-on-Write.
fill_values = {
    'Teacher_Quality': 'Medium',
    'Parental_Education_Level': 'High School',
    'Distance_from_Home': 'Near',
}
for column_name, fill_value in fill_values.items():
    df[column_name] = df[column_name].fillna(fill_value)

In [None]:
# Total number of missing cells left in the frame (displayed as this cell's output).
df.isnull().sum().sum()

## Detecting outliers

In [None]:
# List the float64/int64 columns that have at least one value outside the 1.5 * IQR fences.
knowing_outliers(df)

In [None]:
# Replace each listed column with its clipped version.
for column_name in ('Hours_Studied', 'Tutoring_Sessions', 'Exam_Score'):
    df[column_name] = treat_outliers(df, column_name)

In [None]:
# Re-check after clipping: lists any columns that still have values outside the 1.5 * IQR fences.
knowing_outliers(df)

**No outliers are detected now.**

## Machine Learning 

In [None]:
# One-hot encode the categorical columns as 0/1 integers; drop_first=True omits the
# first level of each category, so that level is represented by all-zero dummies.
new_df=pd.get_dummies(df, drop_first=True,dtype='int')

In [None]:
# Display the encoded frame.
new_df

In [None]:
# Correlation of every column with Exam_Score (the 'Exam_Score' row of the correlation matrix).
new_df.corr(numeric_only=True).loc['Exam_Score']

In [None]:
# Train a LinearRegression on StandardScaler-scaled features. The helper prints MSE and R²,
# shows the actual-vs-predicted plot, pickles the model and scaler, and returns the fitted
# model (which is therefore this cell's displayed output).
scaler=StandardScaler()
model=LinearRegression()
linear_regression_model_plotly(new_df,scaler,model)

In [None]:
# Column names of the encoded frame, in order.
# NOTE(review): presumably the source of `expected_columns` in the Streamlit cell — confirm they still match.
new_df.columns

In [None]:
# Reload the artifacts written by linear_regression_model_plotly(). That function saves to
# the "*_new.pkl" names; the previous "saved_linear_model.pkl" / "saved_scaler_linear.pkl"
# paths referred to files this notebook never creates.
# NOTE: pickle.load can execute arbitrary code — only load files you produced yourself.
with open('saved_linear_model_new.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

with open('saved_scaler_linear_new.pkl', 'rb') as scaler_file:
    scaler = pickle.load(scaler_file)

In [1]:
# Streamlit front end for the trained model.
# NOTE: Streamlit apps are meant to be launched with `streamlit run <script>.py`;
# executing this cell inside the notebook kernel only produces the warning shown below it.
import streamlit as st
import pickle
import numpy as np
import pandas as pd

# Load model and scaler
# NOTE: pickle.load can execute arbitrary code — only load files you produced yourself.
with open('saved_linear_model_new.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

with open('saved_scaler_linear_new.pkl', 'rb') as scaler_file:
    scaler = pickle.load(scaler_file)

# Feature columns, in the order the model input is assembled.
expected_columns = [
    'Hours_Studied', 'Attendance', 'Sleep_Hours', 'Previous_Scores', 
    'Tutoring_Sessions', 'Physical_Activity', 'Parental_Involvement_Low',
    'Parental_Involvement_Medium', 'Access_to_Resources_Low', 
    'Access_to_Resources_Medium', 'Extracurricular_Activities_Yes',
    'Motivation_Level_Low', 'Motivation_Level_Medium', 'Internet_Access_Yes', 
    'Family_Income_Low', 'Family_Income_Medium', 'Teacher_Quality_Low',
    'Teacher_Quality_Medium', 'School_Type_Public', 'Peer_Influence_Neutral',
    'Peer_Influence_Positive', 'Learning_Disabilities_Yes',
    'Parental_Education_Level_High School', 'Parental_Education_Level_Postgraduate',
    'Distance_from_Home_Moderate', 'Distance_from_Home_Near', 'Gender_Male'
]

# Streamlit UI
st.title("Student Performance Prediction")
st.image("pexels-pixabay-256490.jpg", use_column_width=True)

st.header("Enter Student Details:")

# Collecting user input
hours_studied = st.number_input("Hours Studied:", min_value=0.0)
attendance = st.number_input("Attendance:", min_value=0.0)
sleep_hours = st.number_input("Sleep Hours:", min_value=0.0)
previous_scores = st.number_input("Previous Scores:", min_value=0.0)
tutoring_sessions = st.number_input("Tutoring Sessions:", min_value=0.0)
physical_activity = st.number_input("Physical Activity:", min_value=0.0)

parental_involvement = st.selectbox("Parental Involvement:", ["Low", "Medium", "High"])
access_to_resources = st.selectbox("Access to Resources:", ["Low", "Medium", "High"])
extracurricular_activities = st.selectbox("Extracurricular Activities:", ["Yes", "No"])
motivation_level = st.selectbox("Motivation Level:", ["Low", "Medium", "High"])
internet_access = st.selectbox("Internet Access:", ["Yes", "No"])
family_income = st.selectbox("Family Income:", ["Low", "Medium", "High"])
teacher_quality = st.selectbox("Teacher Quality:", ["Low", "Medium", "High"])
school_type = st.selectbox("School Type:", ["Public", "Private"])
peer_influence = st.selectbox("Peer Influence:", ["Neutral", "Positive", "Negative"])
learning_disabilities = st.selectbox("Learning Disabilities:", ["Yes", "No"])
parental_education_level = st.selectbox("Parental Education Level:", ["High School", "Undergraduate", "Postgraduate"])
distance_from_home = st.selectbox("Distance from Home:", ["Near", "Moderate", "Far"])
gender = st.selectbox("Gender:", ["Male", "Female"])

# Submit button
if st.button("Submit"):
    try:
        # Numeric inputs are used as-is.
        feature_values = {
            'Hours_Studied': hours_studied,
            'Attendance': attendance,
            'Sleep_Hours': sleep_hours,
            'Previous_Scores': previous_scores,
            'Tutoring_Sessions': tutoring_sessions,
            'Physical_Activity': physical_activity,
        }

        # Categorical inputs: the dummy column "<field>_<choice>" is set to 1.
        categorical_choices = {
            'Parental_Involvement': parental_involvement,
            'Access_to_Resources': access_to_resources,
            'Extracurricular_Activities': extracurricular_activities,
            'Motivation_Level': motivation_level,
            'Internet_Access': internet_access,
            'Family_Income': family_income,
            'Teacher_Quality': teacher_quality,
            'School_Type': school_type,
            'Peer_Influence': peer_influence,
            'Learning_Disabilities': learning_disabilities,
            'Parental_Education_Level': parental_education_level,
            'Distance_from_Home': distance_from_home,
            'Gender': gender,
        }
        for field, choice in categorical_choices.items():
            feature_values[f'{field}_{choice}'] = 1

        # Building the row by column name keeps it aligned with expected_columns by
        # construction: reindex orders the columns, fills unselected dummies with 0 and
        # discards choices that have no dummy column (e.g. "High", "No", "Female").
        features_frame = pd.DataFrame([feature_values]).reindex(
            columns=expected_columns, fill_value=0)

        # Scale features and make prediction. Passing a named DataFrame lets a scaler
        # that was fitted on named columns verify them instead of trusting position.
        features_scaled = scaler.transform(features_frame)
        predicted_exam_score = model.predict(features_scaled)

        st.success(f"Predicted Exam Score: {predicted_exam_score[0]:.2f}")

    except ValueError as e:
        st.error(f"Error: {str(e)}")
    except Exception as e:
        st.error(f"An unexpected error occurred: {str(e)}")


2024-10-01 14:21:36.077 INFO    numexpr.utils: NumExpr defaulting to 8 threads.
2024-10-01 14:21:46.123 
  command:

    streamlit run C:\Users\Ayman\anaconda3\lib\site-packages\ipykernel_launcher.py [ARGUMENTS]


In [None]:
!pip install streamlit