In [14]:
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer
from scipy.stats import chi2_contingency


In [15]:
df=pd.read_csv('Student.csv')
print(df)

   S_ID   F_name    L_name  No_Sub    Fees
0  H001   Sylvia    Reehal     1.0  1500.0
1  H002     Sara  Chitkara     NaN  2750.0
2  H003        _     Arora     2.0  2750.0
3  H004  salomon    Sharma     2.0     NaN
4  H005    david         _     1.0  3000.0
5  H006    jonas    joseph     1.0  3000.0
6  H007     adam     moham     2.0  6000.0


In [16]:
df.replace ("_", np.nan, inplace= True)
print(df)

   S_ID   F_name    L_name  No_Sub    Fees
0  H001   Sylvia    Reehal     1.0  1500.0
1  H002     Sara  Chitkara     NaN  2750.0
2  H003      NaN     Arora     2.0  2750.0
3  H004  salomon    Sharma     2.0     NaN
4  H005    david       NaN     1.0  3000.0
5  H006    jonas    joseph     1.0  3000.0
6  H007     adam     moham     2.0  6000.0


In [17]:
df["Fees"] = pd.to_numeric(df["Fees"])
df["Fees"].fillna(df["Fees"].mean(), inplace=True)
print(df)
print(df.isna().sum())  # To confirm there are no missing values left



   S_ID   F_name    L_name  No_Sub         Fees
0  H001   Sylvia    Reehal     1.0  1500.000000
1  H002     Sara  Chitkara     NaN  2750.000000
2  H003      NaN     Arora     2.0  2750.000000
3  H004  salomon    Sharma     2.0  3166.666667
4  H005    david       NaN     1.0  3000.000000
5  H006    jonas    joseph     1.0  3000.000000
6  H007     adam     moham     2.0  6000.000000
S_ID      0
F_name    1
L_name    1
No_Sub    1
Fees      0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Fees"].fillna(df["Fees"].mean(), inplace=True)


In [18]:
df["No_Sub"] = pd.to_numeric(df["No_Sub"])
df["No_Sub"].fillna(df["No_Sub"].median(), inplace=True)
print(df)
print(df.isna().sum())  # To confirm there are no missing values left



   S_ID   F_name    L_name  No_Sub         Fees
0  H001   Sylvia    Reehal     1.0  1500.000000
1  H002     Sara  Chitkara     1.5  2750.000000
2  H003      NaN     Arora     2.0  2750.000000
3  H004  salomon    Sharma     2.0  3166.666667
4  H005    david       NaN     1.0  3000.000000
5  H006    jonas    joseph     1.0  3000.000000
6  H007     adam     moham     2.0  6000.000000
S_ID      0
F_name    1
L_name    1
No_Sub    0
Fees      0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["No_Sub"].fillna(df["No_Sub"].median(), inplace=True)


In [19]:
# Add the new field 'Distinction' with the provided values
df['Distinction'] = ['Y', 'N', 'Y', 'NA', 'N', 'Y', 'N']
df.head()

Unnamed: 0,S_ID,F_name,L_name,No_Sub,Fees,Distinction
0,H001,Sylvia,Reehal,1.0,1500.0,Y
1,H002,Sara,Chitkara,1.5,2750.0,N
2,H003,,Arora,2.0,2750.0,Y
3,H004,salomon,Sharma,2.0,3166.666667,
4,H005,david,,1.0,3000.0,N


In [21]:
# Convert 'Distinction' values to numeric: Y=1, N=0, NA=np.nan
distinction_map = {'Y': 1, 'N': 0, 'NA': np.nan}
df['Distinction_numeric'] = df['Distinction'].map(distinction_map)

# Compute correlation with No_Sub
correlation = df['No_Sub'].corr(df['Distinction_numeric'])

print(correlation)

-0.18569533817705186


In [25]:
# Drop rows with missing values in the relevant columns
chi_data = df[['No_Sub', 'Distinction']].dropna()
chi_data['No_Sub'] = chi_data['No_Sub'].astype(int).astype(str)

 #Create a contingency table
contingency_table = pd.crosstab(chi_data['No_Sub'], chi_data['Distinction'])
# Apply the Chi-Squared test
chi2_stat, p_val, dof, expected = chi2_contingency(contingency_table)
# Print results
print("Chi-Squared Statistic:", chi2_stat)
print("Degrees of Freedom:", dof)
print("P-Value:", p_val)


Chi-Squared Statistic: 1.5555555555555556
Degrees of Freedom: 2
P-Value: 0.4594258240359268


In [27]:
# Drop rows with missing values
filtered_df = df[['No_Sub', 'Fees', 'Distinction_numeric']].dropna()

# Compute correlation
corr_no_sub = filtered_df['No_Sub'].corr(filtered_df['Distinction_numeric'])
corr_fees = filtered_df['Fees'].corr(filtered_df['Distinction_numeric'])

print("Correlation with Distinction:")
print("No_Sub:", corr_no_sub)
print("Fees:", corr_fees)


Correlation with Distinction:
No_Sub: -0.18569533817705186
Fees: -0.5487396847647231


In [28]:
# Group by Distinction and calculate variance
variance_by_group = filtered_df.groupby('Distinction_numeric').agg({'No_Sub': 'var', 'Fees': 'var'})

print("Variance of features by Distinction class:")
print(variance_by_group)

Variance of features by Distinction class:
                       No_Sub          Fees
Distinction_numeric                        
0.0                  0.250000  3.270833e+06
1.0                  0.333333  6.458333e+05
