In [1]:
# --- Setup: imports, TensorFlow runtime flags, raw data load ---
import os

# These flags are read when TensorFlow's native runtime is loaded, so they have
# to be in the environment *before* `import tensorflow` (and before scikeras,
# which imports Keras). Assigning them after the import has no effect.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'   # silence TensorFlow's C++ INFO/WARNING/ERROR logs
os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'  # turn off the oneDNN custom-op optimisations

import numpy as np
import pandas as pd
import tensorflow as tf
from imblearn.over_sampling import SMOTE
from scikeras.wrappers import KerasClassifier
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.models import Sequential

# NOTE(review): absolute, machine-specific path -- the notebook only runs on a
# machine with this exact directory layout. Kept as-is; consider a path relative
# to the repository instead.
DATASET_PATH = '/Users/akshay/Desktop/ML-Crate/DataAnalyticsSalaryPrediction/Dataset/Salary Dataset.csv'

# Raw salary table; later cells read the 'Salary', 'Company Name', 'Job Title'
# and 'Location' columns from it.
df = pd.read_csv(DATASET_PATH)






In [2]:
def convert_salary(salary):
    """Convert a raw salary string into a yearly amount as a float.

    The currency markers '₹', '$', '£', 'AFN' and thousands separators (',')
    are stripped, then the pay-period suffix decides the scaling:

    * ``/mo`` -> value * 12
    * ``/hr`` -> value * 40 * 52  (40 hours a week, 52 weeks a year)
    * otherwise a ``/yr`` suffix (if present) is dropped and the value is
      used unchanged.

    Parameters
    ----------
    salary : str or number
        Raw value such as ``'₹50,000/mo'``. A value that is not a string
        (e.g. a float from a column that was already converted) is returned
        as ``float(salary)``, so re-running the conversion cell is harmless.

    Returns
    -------
    float
        The yearly amount. No currency conversion is performed.

    Raises
    ------
    ValueError
        If the remaining text cannot be parsed by ``float``.
    """
    # Already numeric: nothing to strip. Without this guard a second run of
    # the conversion cell fails with "'float' object has no attribute 'replace'".
    if not isinstance(salary, str):
        return float(salary)

    # Same removals as before, in the same effective order, minus the
    # repeated no-op passes over '₹' and ','.
    for token in ('₹', '$', ',', '£', 'AFN'):
        salary = salary.replace(token, '')

    if '/mo' in salary:
        return float(salary.replace('/mo', '')) * 12
    if '/hr' in salary:
        return float(salary.replace('/hr', '')) * 40 * 52
    return float(salary.replace('/yr', ''))

# Replace the raw salary strings with their yearly numeric value, element by element.
df['Salary'] = df['Salary'].map(convert_salary)

In [3]:
# Bucket the yearly salary into five ordinal classes (edges in the dataset's
# salary units; the last bin is open-ended).
salary_bins = [0, 500000, 1000000, 1500000, 2000000, np.inf]
salary_labels = [0, 1, 2, 3, 4]  # one integer class label per bin
# include_lowest=True closes the first interval on the left. Without it a
# salary of exactly 0 falls outside every bin, becomes NaN, and the
# .astype(int) below raises.
df['SalaryBin'] = pd.cut(
    df['Salary'],
    bins=salary_bins,
    labels=salary_labels,
    include_lowest=True,
).astype(int)

# Features are the three categorical columns; the target is the salary class.
categorical_cols = ['Company Name', 'Job Title', 'Location']
X = df[categorical_cols]
y = df['SalaryBin']

# One-hot encode into a dense matrix (one column per distinct category value).
encoder = OneHotEncoder(sparse_output=False)
X_encoded = encoder.fit_transform(X)

# Standardise every one-hot column to zero mean / unit variance.
scaler = StandardScaler()
X_encoded_scaled = scaler.fit_transform(X_encoded)

# Oversample the minority salary classes with synthetic SMOTE samples.
# NOTE(review): the encoder, the scaler and SMOTE are all fitted on the full
# dataset here, before any train/test split. Any evaluation done on
# X_resampled / y_resampled therefore sees synthetic points derived from its
# own test rows and will look optimistic.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_encoded_scaled, y)

ValueError: Cannot cast object dtype to int64

In [7]:

def create_model(optimizer='adam'):
    """Build and compile the feed-forward salary-bin classifier.

    The network is 128 -> 64 -> 32 ReLU units (each with an L2 kernel penalty
    of 0.01), with dropout of 0.5 after the first two hidden layers, followed
    by a 5-way softmax output. The input width is read from the notebook-level
    ``X_resampled`` matrix, so that variable must exist when this is called.

    Parameters
    ----------
    optimizer : str or optimizer instance
        Passed straight to ``compile``; defaults to ``'adam'``.

    Returns
    -------
    The compiled ``Sequential`` model (sparse categorical cross-entropy loss,
    accuracy metric).
    """
    def l2_penalty():
        # A fresh regulariser instance per layer, same strength everywhere.
        return tf.keras.regularizers.l2(0.01)

    n_features = X_resampled.shape[1]
    network = Sequential([
        Input(shape=(n_features,)),
        Dense(128, activation='relu', kernel_regularizer=l2_penalty()),
        Dropout(0.5),
        Dense(64, activation='relu', kernel_regularizer=l2_penalty()),
        Dropout(0.5),
        Dense(32, activation='relu', kernel_regularizer=l2_penalty()),
        Dense(5, activation='softmax'),
    ])
    network.compile(
        loss='sparse_categorical_crossentropy',
        optimizer=optimizer,
        metrics=['accuracy'],
    )
    return network

In [8]:
# Hyper-parameter combinations to try: 2 optimizers x 2 epoch counts x 2 batch sizes.
param_grid = dict(
    optimizer=['adam', 'rmsprop'],
    epochs=[50, 100],
    batch_size=[64, 128],
)

# scikit-learn compatible wrapper around create_model; Keras progress output is silenced.
model = KerasClassifier(model=create_model, verbose=0)


In [10]:
# NOTE(review): X_resampled / y_resampled were oversampled with SMOTE on the
# whole dataset earlier in the notebook, so every score below is measured on
# data that includes synthetic samples built from the training rows' own
# neighbours. Treat the numbers as optimistic; a leak-free setup would split
# first and resample only the training portion.

# Exhaustive search over param_grid with 3-fold CV; any failing fit aborts the search.
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=4, cv=3, error_score='raise')
grid_result = grid.fit(X_resampled, y_resampled)

print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
best_model = grid_result.best_estimator_

# Evaluate the best model with cross-validation
scores = cross_val_score(best_model, X_resampled, y_resampled, cv=5)
print("Accuracy: %.2f%% (+/- %.2f%%)" % (scores.mean() * 100, scores.std() * 100))

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.25, random_state=42)
# Refits best_model in place on the 75% training portion; this refitted
# estimator is what `best_model` refers to from here on.
# `fit` follows the scikit-learn convention of returning the estimator, so
# `history` is the fitted classifier, not a Keras History object.
history = best_model.fit(X_train, y_train)

# Class probabilities, one column per entry of best_model.classes_.
y_pred_prob = best_model.predict_proba(X_test)
# Map the winning column back to its class label via classes_ rather than
# using the raw column index as the label (the two only coincide when the
# labels happen to be exactly 0..n-1).
y_pred = best_model.classes_[np.argmax(y_pred_prob, axis=1)]

# Compute confusion matrix and accuracy
cm = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

print("Confusion Matrix:")
print(cm)
print(f"Accuracy: {accuracy:.2f}")


Best: 0.600000 using {'batch_size': 128, 'epochs': 100, 'optimizer': 'adam'}
Accuracy: 61.22% (+/- 8.57%)
Confusion Matrix:
[[195  92  62  13   8]
 [111 134 131  22   8]
 [ 33  38 282  12  12]
 [  7  13  12 350   5]
 [  4  11   6  11 333]]
Accuracy: 0.68


In [11]:
import pickle

In [13]:
# Persist the tuned classifier next to the notebook (relative path, no extension).
model_filename = 'model1'
with open(model_filename, mode='wb') as model_file:
    pickle.dump(best_model, model_file)

In [4]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import pickle

# Re-read the raw salary table so the preprocessing below starts from unconverted data.
df = pd.read_csv(
    filepath_or_buffer='/Users/akshay/Desktop/ML-Crate/DataAnalyticsSalaryPrediction/Dataset/Salary Dataset.csv'
)

def convert_salary(salary):
    """Convert a raw salary string into a yearly amount as a float.

    The currency markers '₹', '$', '£', 'AFN' and thousands separators (',')
    are stripped, then the pay-period suffix decides the scaling:

    * ``/mo`` -> value * 12
    * ``/hr`` -> value * 40 * 52  (40 hours a week, 52 weeks a year)
    * otherwise a ``/yr`` suffix (if present) is dropped and the value is
      used unchanged.

    Parameters
    ----------
    salary : str or number
        Raw value such as ``'₹50,000/mo'``. A value that is not a string
        (e.g. a float from a column that was already converted) is returned
        as ``float(salary)``, so re-running the conversion is harmless.

    Returns
    -------
    float
        The yearly amount. No currency conversion is performed.

    Raises
    ------
    ValueError
        If the remaining text cannot be parsed by ``float``.
    """
    # Already numeric: nothing to strip. Without this guard a second run on a
    # converted column fails with "'float' object has no attribute 'replace'".
    if not isinstance(salary, str):
        return float(salary)

    # Same removals as before, in the same effective order, minus the
    # repeated no-op passes over '₹' and ','.
    for token in ('₹', '$', ',', '£', 'AFN'):
        salary = salary.replace(token, '')

    if '/mo' in salary:
        return float(salary.replace('/mo', '')) * 12
    if '/hr' in salary:
        return float(salary.replace('/hr', '')) * 40 * 52
    return float(salary.replace('/yr', ''))

# Turn the raw salary strings into yearly numeric amounts.
df['Salary'] = df['Salary'].apply(convert_salary)

# Bucket the yearly salary into five ordinal classes (last bin is open-ended).
salary_bins = [0, 500000, 1000000, 1500000, 2000000, np.inf]
salary_labels = [0, 1, 2, 3, 4]
# include_lowest=True closes the first interval on the left. Without it a
# salary of exactly 0 falls outside every bin, becomes NaN, and the
# .astype(int) below raises.
df['SalaryBin'] = pd.cut(
    df['Salary'],
    bins=salary_bins,
    labels=salary_labels,
    include_lowest=True,
).astype(int)

# Features are the three categorical columns; the target is the salary class.
categorical_cols = ['Company Name', 'Job Title', 'Location']
X = df[categorical_cols]
y = df['SalaryBin']

# Create and fit the encoder.
# handle_unknown='ignore': these fitted objects are pickled below, presumably
# to transform new inputs later. With the default ('error') any company, job
# title or location not present in this dataset would make transform() raise;
# with 'ignore' such a value is encoded as all zeros for that column group.
# The matrix produced for the fitting data itself is unchanged.
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
X_encoded = encoder.fit_transform(X)

# Create and fit the scaler on the dense one-hot matrix.
scaler = StandardScaler()
X_encoded_scaled = scaler.fit_transform(X_encoded)

# NOTE(review): both output paths are absolute and machine-specific; the
# target directory must already exist.

# Save the encoder
with open('/Users/akshay/Desktop/ML-Crate/DataAnalyticsSalaryPrediction/Model/encoder.pkl', 'wb') as file:
    pickle.dump(encoder, file)

# Save the scaler
with open('/Users/akshay/Desktop/ML-Crate/DataAnalyticsSalaryPrediction/Model/scaler.pkl', 'wb') as file:
    pickle.dump(scaler, file)
