In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder

file_path = '/home/bhikrant07/Desktop/AI/KU_STUDENT_DATA_ON_CAMPUS_PLACEMENT.csv'
data = pd.read_csv(file_path)


print(data.head(1))

  Branch               Cepo  Program  End term exam SGPA - 1st semester   \
0    CSE  Currently enrolled  B.Tech                                 6.5   

   End term exam SGPA - 2nd semester  End term exam SGPA - 3rd semester  \
0                                7.2                                6.1   

   End term exam SGPA - 4th semester  End term exam SGPA - 5th semester  \
0                                8.2                                6.8   

   End term exam SGPA - 6th semester  End term exam SGPA - 7th semester  ...  \
0                                6.6                                6.3  ...   

   EDU_LN  SCHL_RCV  URB_RUR INT_CONN How many hrs you study after school?  \
0     Yes        No    Urban     Good                                    1   

   How many value added program you have entered? (coursera/ AWS/IBM etc)  \
0                                                  2                        

   SPOR_PSN  COC_PART COC_PART_ROLE Cam_plc  
0       Yes       yes   Te

In [2]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Convert all text in object columns to lowercase
data = data.apply(lambda x: x.str.lower() if x.dtype == "object" else x)

# Encode categorical variables
categorical_features = data.select_dtypes(include=['object']).columns.tolist()
encoder = OneHotEncoder(sparse=False,drop ='first')
encoded_categorical_data = encoder.fit_transform(data[categorical_features])


# Normalize numerical features
numerical_features = data.select_dtypes(include=['float64', 'int64']).columns.tolist()
scaler = StandardScaler()
scaled_numerical_data = scaler.fit_transform(data[numerical_features])

# Combine encoded and scaled data
encoded_categorical_df = pd.DataFrame(encoded_categorical_data, columns=encoder.get_feature_names_out(categorical_features))
scaled_numerical_df = pd.DataFrame(scaled_numerical_data, columns=numerical_features)
processed_data = pd.concat([encoded_categorical_df, scaled_numerical_df], axis=1)



In [3]:
print(processed_data.head(1))

   Branch_cse  Branch_ece  Branch_ee  Branch_me  Branch_mscit  \
0         1.0         0.0        0.0        0.0           0.0   

   Cepo _passed out  Program_mscit  C_X_B_state board  C_XII_B_state board  \
0               0.0            0.0                1.0                  1.0   

   M_F_male  ...  End term exam SGPA - 6th semester  \
0       1.0  ...                           -0.93521   

   End term exam SGPA - 7th semester  End term exam SGPA - 8th semester  \
0                          -0.674564                          -0.704771   

   CGPA after 8th semester  Class X grade  Class XII grade  \
0                -0.759161       0.222934        -0.335895   

   Overall Attendance percentage  Number of internships during undergraduate.  \
0                       0.158114                                     0.192524   

   How many hrs you study after school?  \
0                             -0.774139   

   How many value added program you have entered? (coursera/ AWS/IBM etc)  

In [4]:
print(processed_data.columns)

Index(['Branch_cse', 'Branch_ece', 'Branch_ee', 'Branch_me', 'Branch_mscit',
       'Cepo _passed out', 'Program_mscit', 'C_X_B_state board',
       'C_XII_B_state board', 'M_F_male', 'C_HLTH_good', 'C_HLTH_poor',
       'FAM_TYPE_nuclear family', 'EDU_LN_yes', 'SCHL_RCV_yes',
       'URB_RUR_urban', 'INT_CONN_poor', 'SPOR_PSN_yes', 'COC_PART_yes',
       'COC_PART_ROLE_volunteer', 'Cam_plc_yes',
       'End term exam SGPA - 1st semester ',
       'End term exam SGPA - 2nd semester',
       'End term exam SGPA - 3rd semester',
       'End term exam SGPA - 4th semester',
       'End term exam SGPA - 5th semester',
       'End term exam SGPA - 6th semester',
       'End term exam SGPA - 7th semester',
       'End term exam SGPA - 8th semester', 'CGPA after 8th semester',
       'Class X grade', 'Class XII grade', 'Overall Attendance percentage',
       'Number of internships during undergraduate.',
       'How many hrs you study after school?',
       'How many value added program you ha

In [5]:
from sklearn.model_selection import train_test_split
import numpy as np

# Define target variables
X_cgpa = processed_data.drop(['CGPA after 8th semester'], axis=1)
y_cgpa = processed_data['CGPA after 8th semester']

X_placement = processed_data.drop(['Cam_plc_yes'], axis=1)
y_placement = processed_data['Cam_plc_yes']
# .apply(lambda x: 1 if x == 'yes' else 0)
# Split the data
X_train_cgpa, X_test_cgpa, y_train_cgpa, y_test_cgpa = train_test_split(X_cgpa, y_cgpa, test_size=0.2, random_state=42)

# Split the data for placement prediction
X_train_placement, X_test_placement, y_train_placement, y_test_placement = train_test_split(X_placement, y_placement, test_size=0.2, random_state=42)

# Reshape data for LSTM
X_train_cgpa_reshaped = np.reshape(X_train_cgpa.values, (X_train_cgpa.shape[0], X_train_cgpa.shape[1], 1))
X_test_cgpa_reshaped = np.reshape(X_test_cgpa.values, (X_test_cgpa.shape[0], X_test_cgpa.shape[1], 1))

X_train_placement_reshaped = np.reshape(X_train_placement.values, (X_train_placement.shape[0], X_train_placement.shape[1], 1))
X_test_placement_reshaped = np.reshape(X_test_placement.values, (X_test_placement.shape[0], X_test_placement.shape[1], 1))


In [6]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
import numpy as np
from keras.models import Sequential
from keras.layers import LSTM, Dense
from keras.optimizers import Adam


# LSTM model for CGPA prediction
def create_cgpa_lstm_model(input_shape):
    model = Sequential()
    model.add(LSTM(50, return_sequences=True, input_shape=input_shape))
    model.add(LSTM(50, return_sequences=True))
    # model.add(LSTM(50, return_sequences=True))
    model.add(LSTM(50))
    model.add(Dense(1, activation='linear'))    
    
    optimizer = Adam(learning_rate=0.01)
    model.compile(loss='mean_squared_error', optimizer=optimizer, metrics=['mse'])
    return model

# LSTM model for placement prediction
def create_placement_lstm_model(input_shape):
    model = Sequential()
    model.add(LSTM(50, return_sequences=True, input_shape=input_shape))
    model.add(LSTM(50, return_sequences=True))
    model.add(LSTM(50, return_sequences=True))
    model.add(LSTM(50))
    model.add(Dense(1, activation='sigmoid'))  
    
    optimizer = Adam(learning_rate=0.009)
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return model


2024-06-30 23:18:51.153790: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-06-30 23:18:51.179987: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [7]:
# Train & evaluate 
model_cgpa = create_cgpa_lstm_model((X_train_cgpa_reshaped.shape[1], X_train_cgpa_reshaped.shape[2]))
                                           #no of timesteps               no of features per timestep
model_cgpa.fit(X_train_cgpa_reshaped, y_train_cgpa, epochs=10, batch_size=32, verbose=1)
loss_cgpa, mse_cgpa = model_cgpa.evaluate(X_test_cgpa_reshaped, y_test_cgpa, verbose=0)
print('MSE for CGPA:', mse_cgpa)


model_placement = create_placement_lstm_model((X_train_placement_reshaped.shape[1], X_train_placement_reshaped.shape[2]))
model_placement.fit(X_train_placement_reshaped, y_train_placement, epochs=10, batch_size=32, verbose=1)
loss_placement, acc_placement = model_placement.evaluate(X_test_placement_reshaped, y_test_placement, verbose=0)
print('Accuracy for Placement:', acc_placement)

Epoch 1/10


2024-06-30 23:18:52.586190: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:282] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
  super().__init__(**kwargs)


[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 24ms/step - loss: 1.1554 - mse: 1.1554
Epoch 2/10
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step - loss: 0.6543 - mse: 0.6543
Epoch 3/10
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step - loss: 0.3587 - mse: 0.3587
Epoch 4/10
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step - loss: 0.3443 - mse: 0.3443
Epoch 5/10
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step - loss: 0.3889 - mse: 0.3889
Epoch 6/10
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step - loss: 0.3328 - mse: 0.3328
Epoch 7/10
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step - loss: 0.3189 - mse: 0.3189
Epoch 8/10
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step - loss: 0.2240 - mse: 0.2240
Epoch 9/10
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - loss: 0.2952

  super().__init__(**kwargs)


[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 31ms/step - accuracy: 0.5667 - loss: 0.6980
Epoch 2/10
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step - accuracy: 0.5451 - loss: 0.6937
Epoch 3/10
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step - accuracy: 0.4977 - loss: 0.6945
Epoch 4/10
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step - accuracy: 0.5371 - loss: 0.6927
Epoch 5/10
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step - accuracy: 0.5115 - loss: 0.6922
Epoch 6/10
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step - accuracy: 0.5130 - loss: 0.6947
Epoch 7/10
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step - accuracy: 0.4925 - loss: 0.6942
Epoch 8/10
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step - accuracy: 0.4903 - loss: 0.6927
Epoch 9/10
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m

In [8]:
# # In the context of using LSTM (Long Short-Term Memory) models for sequence prediction or classification tasks, reshaping the input data is crucial to match the expected format by the LSTM layer. Let's break down the specifics:

# ### Why Reshape for LSTM Models?

# 1. **Input Shape Requirement**:
#    - LSTM layers in Keras/TensorFlow expect input data in a specific format: `(batch_size, timesteps, input_dim)`.
#    - `batch_size`: Number of samples in each batch of data.
#    - `timesteps`: Number of time steps or sequence length in each sample.
#    - `input_dim`: Number of features (or dimensions) in each time step.

# 2. **Reshaping Purpose**:
#    - The original shape of your input data might be `(number_of_samples, number_of_features)`. For LSTM models, especially when dealing with sequential data (like time series or text sequences), you reshape it to `(number_of_samples, number_of_timesteps, input_dim)`.

# 3. **Specifics of Reshaping**:
#    - `X_train_placement.values`: This likely refers to your training data values, typically a DataFrame or array.
#    - `X_train_placement.shape[0]`: Refers to the number of samples (rows) in your data.
#    - `X_train_placement.shape[1]`: Refers to the number of features (columns) in your data.

# 4. **Example Reshaping**:
#    - If your original data (`X_train_placement`) has dimensions `(1000, 10)`, it means you have 1000 samples and each sample has 10 features.
#    - Reshaping it for LSTM might look like: `X_train_placement_reshaped = np.reshape(X_train_placement.values, (1000, 10, 1))`.
#      - Here, `10` becomes `timesteps`, and `1` becomes `input_dim`, assuming you are adding a new dimension for LSTM.

# ### Example:

# ```python
# import numpy as np

# # Example data shape
# # X_train_placement.shape = (1000, 10)  # 1000 samples, 10 features

# # Reshape for LSTM
# X_train_placement_reshaped = np.reshape(X_train_placement.values, (1000, 10, 1))
# # X_train_placement_reshaped.shape = (1000, 10, 1)  # 1000 samples, 10 timesteps, 1 feature per timestep
# ```

# ### Summary:

# - **Reshaping** for LSTM models ensures your data is formatted correctly `(batch_size, timesteps, input_dim)`.
# - `shape[0]` refers to the number of samples (rows), and `shape[1]` refers to the number of features (columns) in your original data.
# - The reshaping process adapts your data to fit the requirements of the LSTM layer, enabling it to effectively learn from sequential patterns in your data.

In [9]:
X_train_cgpa_reshaped.shape[1]

35

In [10]:
# # Example for prediction

# # Predict CGPA for a new student
# student_cgpa_data = [/* student data for CGPA prediction */]
# student_cgpa_input = np.array(student_cgpa_data).reshape((1, len(student_cgpa_data), 1))
# predicted_cgpa = model_cgpa.predict(student_cgpa_input)
# print('Predicted CGPA:', predicted_cgpa)

# # Predict Placement for a new student
# student_placement_data = [/* student data for Placement prediction */]
# student_placement_input = np.array(student_placement_data).reshape((1, len(student_placement_data), 1))
# predicted_placement = model_placement.predict(student_placement_input)
# print('Predicted Placement:', 'Placed' if predicted_placement > 0.5 else 'Not Placed')

In [11]:
def predict_student_cgpa(student_index):
# Select a student for prediction
#student_index = 0  # Change this index to select different students
    
    #individual std data
    student_data_a = X_cgpa.iloc[student_index]

    # reshape the data for lstm
    cgpa_input = np.array(student_data_a).reshape((1, len(student_data_a), 1))
    
    # Predict CGPA
    predicted_cgpa = model_cgpa.predict(cgpa_input)
    print('Predicted CGPA (original scale):', predicted_cgpa[0][0])


    #trial code to inverse transform
    #inverse transform
    
    y_cgpa = data['CGPA after 8th semester']
    y_train_cgpa = np.array(y_cgpa.values)
    scaler = StandardScaler()
    y_train_cgpa_scaled = scaler.fit_transform(y_train_cgpa.reshape(-1, 1)) #fit the data
    predicted_cgpa_original = scaler.inverse_transform(predicted_cgpa) #now do inverse transform
    print('Predicted CGPA (original scale):', predicted_cgpa_original[0][0])


def predict_student_placement(student_index):
    
    student_data_b = X_placement.iloc[student_index]
    
    #same for placemernt
    placement_input = np.array(student_data_b).reshape((1, len(student_data_b), 1))
    
    # Predict Placement
    predicted_placement = model_placement.predict(placement_input)
    print('Predicted Placement:', 'Placed' if predicted_placement[0][0] > 0.5 else 'Not Placed')

In [12]:
predict_student_cgpa(0)
predict_student_placement(0)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 170ms/step
Predicted CGPA (original scale): -0.43841696
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 212ms/step
Predicted Placement: Placed


In [13]:
processed_data.iloc[0]['CGPA after 8th semester']

-0.7591614943875791

In [14]:
data.iloc[0]['Cam_plc']

'yes'