In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import ttest_ind, chi2_contingency
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [2]:
# Path to the input CSV. NOTE(review): '/content' looks like a Colab upload
# directory -- adjust this when running elsewhere.
df_path = '/content/Placement_Data_Full_Class.csv'

# Load the whole file into a DataFrame; the later cells derive from `df`.
df = pd.read_csv(filepath_or_buffer=df_path)

In [None]:
# Show the loaded frame; pandas abbreviates long frames in printed output.
print(df)

     sl_no gender  ssc_p    ssc_b  hsc_p    hsc_b     hsc_s  degree_p  \
0        1      M  67.00   Others  91.00   Others  Commerce     58.00   
1        2      M  79.33  Central  78.33   Others   Science     77.48   
2        3      M  65.00  Central  68.00  Central      Arts     64.00   
3        4      M  56.00  Central  52.00  Central   Science     52.00   
4        5      M  85.80  Central  73.60  Central  Commerce     73.30   
..     ...    ...    ...      ...    ...      ...       ...       ...   
210    211      M  80.60   Others  82.00   Others  Commerce     77.60   
211    212      M  58.00   Others  60.00   Others   Science     72.00   
212    213      M  67.00   Others  67.00   Others  Commerce     73.00   
213    214      F  74.00   Others  66.00   Others  Commerce     58.00   
214    215      M  62.00  Central  58.00   Others   Science     53.00   

      degree_t workex  etest_p specialisation  mba_p      status    salary  
0     Sci&Tech     No     55.0         Mkt&HR 

In [3]:
# Flag every column that holds at least one missing value, then list the
# labels of the flagged columns.
missing_mask = df.isna().any(axis=0)
columns_with_nan = df.columns[missing_mask]
print(columns_with_nan)

Index(['salary'], dtype='object')


Statistical Tests (correlation and one-way ANOVA against salary)

In [4]:

# Column to predict and the predictor columns fed to the models.
target = 'salary'
features = [
    'ssc_p', 'hsc_p', 'hsc_s', 'degree_p',
    'degree_t', 'workex', 'specialisation',
]

# Keep only the rows where the target is present; rows with a missing salary
# cannot serve as supervised examples.
data = df[df[target].notna()]
X, y = data[features], data[target]


In [5]:
# Hold out 25% of the rows for evaluation; the fixed seed makes the split
# repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [24]:
import scipy.stats as stats
import statsmodels.api as sm

# Columns grouped by type for the two analyses below.
numerical_features = ['ssc_p', 'hsc_p', 'degree_p', 'etest_p', 'mba_p']
categorical_features = [
    'gender', 'ssc_b', 'hsc_b', 'hsc_s',
    'degree_t', 'workex', 'specialisation',
]
target = 'salary'

# Pairwise correlations (pandas' default method is Pearson) among the numeric
# columns and the target.
correlation_df = data.loc[:, numerical_features + [target]].corr()
print("Correlation Matrix:")
print(correlation_df)

# One-way ANOVA per categorical column: collect the target values of each
# category level and test whether the group means differ.
for feature in categorical_features:
    grouped_data = [group[target].tolist() for _, group in data.groupby(feature)]
    f_stat, p_value = stats.f_oneway(*grouped_data)
    print(f"ANOVA Test for {feature}: F-statistic={f_stat}, p-value={p_value}")



Correlation Matrix:
             ssc_p     hsc_p  degree_p   etest_p     mba_p    salary
ssc_p     1.000000  0.293416  0.380657  0.317892  0.430560  0.035330
hsc_p     0.293416  1.000000  0.221307  0.284672  0.329983  0.076819
degree_p  0.380657  0.221307  1.000000  0.217683  0.494093 -0.019272
etest_p   0.317892  0.284672  0.217683  1.000000  0.284143  0.178307
mba_p     0.430560  0.329983  0.494093  0.284143  1.000000  0.175013
salary    0.035330  0.076819 -0.019272  0.178307  0.175013  1.000000
ANOVA Test for gender: F-statistic=3.7824499302878496, p-value=0.053714668806406095
ANOVA Test for ssc_b: F-statistic=0.0044795333543091705, p-value=0.9467295407875281
ANOVA Test for hsc_b: F-statistic=0.00832142508589112, p-value=0.9274413548272418
ANOVA Test for hsc_s: F-statistic=0.6567251207686856, p-value=0.520082117101212
ANOVA Test for degree_t: F-statistic=2.2241713568391965, p-value=0.11183433736380005
ANOVA Test for workex: F-statistic=2.789346817266034, p-value=0.09703543621755742



The correlation matrix shows the pairwise correlation coefficients among the numerical features and the salary; the diagonal entries are each variable's correlation with itself, which is always 1. Here's how to interpret the correlation matrix:

The correlation coefficient ranges from -1 to 1. A value closer to 1 indicates a strong positive correlation, while a value closer to -1 indicates a strong negative correlation. A value around 0 indicates no linear correlation.
For example, the correlation coefficient between 'ssc_p' (Secondary Education percentage) and 'salary' is 0.035, indicating a very weak positive correlation.
Similarly, the correlation coefficient between 'hsc_p' (Higher Secondary Education percentage) and 'salary' is 0.077, also indicating a very weak positive correlation.
The highest correlation coefficient with 'salary' is seen with 'etest_p' at 0.178, closely followed by 'mba_p' (MBA percentage) at 0.175; both suggest only a weak positive correlation.
Regarding the ANOVA tests for categorical variables:

The ANOVA test assesses whether there are statistically significant differences in the mean salary across different categories of a categorical variable.
For instance, the p-value for the ANOVA test for 'gender' is 0.054, which is just above the conventional significance level of 0.05: the difference in mean salary between genders is suggestive, but it is not statistically significant at that level.
Similarly, for 'degree_t' (Under Graduation Degree type), the p-value is 0.112, indicating that there might be some difference in the mean salary between different types of degrees, but it's not statistically significant.
On the other hand, for 'hsc_s' (Specialization in Higher Secondary Education), the p-value is 0.520, suggesting that there is no significant difference in the mean salary across different specializations.

In [None]:
# Inspect the training-split features returned by train_test_split.
print(X_train)

     ssc_p  hsc_p     hsc_s  degree_p   degree_t workex specialisation
212   67.0   67.0  Commerce     73.00  Comm&Mgmt    Yes        Mkt&Fin
62    86.5   64.2   Science     67.40   Sci&Tech     No        Mkt&Fin
24    76.5   97.7   Science     78.86   Sci&Tech     No        Mkt&Fin
16    63.0   66.2  Commerce     65.60  Comm&Mgmt    Yes        Mkt&Fin
160   87.0   74.0   Science     65.00   Sci&Tech    Yes         Mkt&HR
..     ...    ...       ...       ...        ...    ...            ...
103   73.0   78.0   Science     73.00   Sci&Tech    Yes         Mkt&HR
147   70.0   74.0  Commerce     65.00  Comm&Mgmt     No        Mkt&Fin
22    69.8   60.8   Science     72.23   Sci&Tech     No         Mkt&HR
129   76.7   89.7  Commerce     66.00  Comm&Mgmt    Yes        Mkt&Fin
142   85.0   60.0   Science     73.43   Sci&Tech    Yes        Mkt&Fin

[111 rows x 7 columns]


In [None]:
# Same missing-value check, this time on the training features: flag columns
# with at least one NaN and list their labels.
train_missing_mask = X_train.isna().any(axis=0)
columns_with_nan = X_train.columns[train_missing_mask]
print(columns_with_nan)

Index([], dtype='object')


In [6]:
# Column groups used to route features through the preprocessing transformers.
numeric_features = [
    'ssc_p',
    'hsc_p',
    'degree_p',
]
categorical_features = [
    'hsc_s',
    'degree_t',
    'workex',
    'specialisation',
]

In [7]:
# OneHotEncoder's dense-output flag was renamed from `sparse` to
# `sparse_output` in scikit-learn 1.2 and the old name was removed in 1.4,
# where `sparse=False` raises TypeError. Prefer the new keyword and fall back
# only on releases that predate it.
try:
    categorical_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    categorical_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

# Numeric columns: fill gaps with the column mean learned during fit.
# Categorical columns: one-hot encode; categories not seen during fit are
# encoded as all zeros instead of raising.
preprocessor = ColumnTransformer(
    transformers=[
        ('numeric', SimpleImputer(strategy='mean'), numeric_features),
        ('categorical', categorical_encoder, categorical_features),
    ])

# Fit on the training split only, then apply the fitted transformer to the
# test split so no test-set statistics leak into preprocessing.
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)



In [None]:
# # Preprocessing for numerical features
# numeric_features = ['ssc_p', 'hsc_p', 'degree_p', 'etest_p', 'mba_p']
# X_train[numeric_features] = X_train[numeric_features].fillna(X_train[numeric_features].mean())
# X_test[numeric_features] = X_test[numeric_features].fillna(X_train[numeric_features].mean())  # Impute with mean of training set

# # # Replace missing values in 'salary' with 0
# # X_train['salary'].fillna(0, inplace=True)
# # X_test['salary'].fillna(0, inplace=True)

# # Preprocessing for categorical features
# categorical_features = ['gender', 'ssc_b', 'hsc_b', 'hsc_s', 'degree_t', 'workex', 'specialisation']
# categorical_transformer = OneHotEncoder(handle_unknown='ignore', sparse=False)
# X_train_categorical = categorical_transformer.fit_transform(X_train[categorical_features])
# X_test_categorical = categorical_transformer.transform(X_test[categorical_features])

# # Concatenate the transformed features
# X_train_transformed = pd.concat([X_train[numeric_features], pd.DataFrame(X_train_categorical)], axis=1)
# X_test_transformed = pd.concat([X_test[numeric_features], pd.DataFrame(X_test_categorical)], axis=1)

# # # Convert feature names to strings
# # X_train_transformed.columns = X_train_transformed.columns.astype(str)
# # X_test_transformed.columns = X_test_transformed.columns.astype(str)



In [8]:
# Inspect the transformed training matrix (numeric columns first, followed by
# the one-hot encoded categorical columns).
print(X_train_transformed)

[[67.   67.   73.   ...  1.    1.    0.  ]
 [86.5  64.2  67.4  ...  0.    1.    0.  ]
 [76.5  97.7  78.86 ...  0.    1.    0.  ]
 ...
 [69.8  60.8  72.23 ...  0.    0.    1.  ]
 [76.7  89.7  66.   ...  1.    1.    0.  ]
 [85.   60.   73.43 ...  1.    1.    0.  ]]


Regression Models (Gradient Boosting, Random Forest, SVR, Kernel Ridge, Bayesian Ridge)

In [16]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

# Gradient boosting regressor with a fixed seed for repeatable results.
# (A RandomForestRegressor used to be constructed here and was overwritten on
# the very next statement without ever being fitted; that dead assignment has
# been dropped. The random forest is evaluated in its own cell.)
model = GradientBoostingRegressor(random_state=42)

# Train the model
model.fit(X_train_transformed, y_train)

# Make predictions on the testing set
y_pred = model.predict(X_test_transformed)

# Evaluate the model on the test split with three metrics: MAE, MSE and MAPE.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
# MAPE: mean of |actual - predicted| / actual, expressed as a percentage.
mape = (abs(y_test - y_pred) / y_test).mean() * 100
print("Mean Absolute Percentage Error (MAPE):", mape)
print("Mean Squared Error (MSE):", mse)
print("Mean Absolute Error (MAE):", mae)

Mean Absolute Percentage Error (MAPE): 20.74844900308346
Mean Squared Error (MSE): 7805896781.485537
Mean Absolute Error (MAE): 62478.61123363227


In [15]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Fit a random forest (fixed seed) on the training split; fit() returns the
# estimator itself, so construction and training are chained.
model = RandomForestRegressor(random_state=42).fit(X_train_transformed, y_train)

# Predict salaries for the held-out split.
y_pred = model.predict(X_test_transformed)

# Test-split error metrics: percentage error, squared error, absolute error.
mape = ((y_test - y_pred).abs() / y_test).mean() * 100
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

print("Mean Absolute Percentage Error (MAPE):", mape)
print("Mean Squared Error (MSE):", mse)
print("Mean Absolute Error (MAE):", mae)

Mean Absolute Percentage Error (MAPE): 20.963986355397783
Mean Squared Error (MSE): 9330525848.64865
Mean Absolute Error (MAE): 63198.91891891892


In [17]:
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Support vector regressor with its default hyper-parameters.
model = SVR()
model.fit(X_train_transformed, y_train)

# Predictions for the held-out split.
y_pred = model.predict(X_test_transformed)

# Compute the three test-split metrics.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
relative_error = abs(y_test - y_pred) / y_test
mape = relative_error.mean() * 100

# Report in the same order as the other model cells.
report = {
    "Mean Absolute Percentage Error (MAPE):": mape,
    "Mean Squared Error (MSE):": mse,
    "Mean Absolute Error (MAE):": mae,
}
for label, value in report.items():
    print(label, value)


Mean Absolute Percentage Error (MAPE): 14.882464063187587
Mean Squared Error (MSE): 7673972491.893243
Mean Absolute Error (MAE): 51243.17619829537


In [19]:
from sklearn.kernel_ridge import KernelRidge
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Kernel ridge regression with its default settings, trained on the
# transformed training split.
model = KernelRidge().fit(X_train_transformed, y_train)

# Predict the held-out split and keep the residuals for the percentage error.
y_pred = model.predict(X_test_transformed)
residuals = y_test - y_pred

# Test-split metrics.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
mape = (abs(residuals) / y_test).mean() * 100

print("Mean Absolute Percentage Error (MAPE):", mape)
print("Mean Squared Error (MSE):", mse)
print("Mean Absolute Error (MAE):", mae)


Mean Absolute Percentage Error (MAPE): 19.186334497582916
Mean Squared Error (MSE): 6891132844.265975
Mean Absolute Error (MAE): 58587.07704763477


In [20]:
from sklearn.linear_model import BayesianRidge

# Bayesian ridge regression with default priors.
# NOTE: mean_absolute_error / mean_squared_error are not imported in this
# cell; they must already be in scope from an earlier cell.
model = BayesianRidge()
model.fit(X_train_transformed, y_train)

# Predict the held-out split.
y_pred = model.predict(X_test_transformed)

# Test-split metrics, computed in reporting order.
mape = ((y_test - y_pred).abs() / y_test).mean() * 100
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

print("Mean Absolute Percentage Error (MAPE):", mape)
print("Mean Squared Error (MSE):", mse)
print("Mean Absolute Error (MAE):", mae)

Mean Absolute Percentage Error (MAPE): 18.985872123526242
Mean Squared Error (MSE): 7063259767.185137
Mean Absolute Error (MAE): 58754.079707451165
