In [5]:
#libraries for getting data from google form
import gspread
import pandas as pd
from oauth2client.service_account import ServiceAccountCredentials

#ml
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

getting data from google form

In [None]:
# Authorise a gspread client with a service-account key file.
# The two scopes requested are the Sheets feeds endpoint and Google Drive.
scope = [
    'https://spreadsheets.google.com/feeds',
    'https://www.googleapis.com/auth/drive',
]
# NOTE(review): 'your_credentials_file.json' looks like a placeholder — point it at the real key file.
creds = ServiceAccountCredentials.from_json_keyfile_name(
    'your_credentials_file.json',
    scope,
)
client = gspread.authorize(creds)

In [None]:
# Look the spreadsheet up by its title, then take its `sheet1` worksheet.
# NOTE(review): 'your_google_form_name' looks like a placeholder title — replace before running.
responses_book = client.open('your_google_form_name')
sheet = responses_book.sheet1

In [None]:
# Fetch the worksheet contents through gspread's get_all_records().
# NOTE(review): gspread documents the return value as a list of dicts keyed by the
# header row — confirm for the installed version. `data` is therefore raw records,
# not a DataFrame; a later cell wraps it in one.
data = sheet.get_all_records()

In [None]:
# Wrap the records fetched from the sheet in a pandas DataFrame.
# Presumably one row per form response and one column per question — verify against the sheet.
df = pd.DataFrame(data)

In [None]:
# Save the DataFrame as 'google_form_data.csv' (relative path, so it lands in the
# current working directory); index=False leaves the row index out of the file.
# NOTE(review): form responses may contain personal data — confirm before sharing the CSV.
df.to_csv('google_form_data.csv', index=False)

preprocess

In [None]:
#duplicates

In [None]:
# Separate the model inputs from the prediction target.
# `data` holds the raw return value of sheet.get_all_records() (gspread documents it
# as a list of dicts), which has no .drop() and cannot be indexed by column name.
# The column operations therefore go through `df`, the DataFrame built from it.
# Features: every column except 'Type' and the target 'Job'.
X = df.drop(columns=['Type', 'Job'])
# Target: the 'Job' column.
y = df['Job']

In [None]:
#null values

In [None]:
#formatting the columns

visualization

In [None]:
#personality types distribution

In [None]:
#ml

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Define pipelines for each classifier: standardise the features, then fit the model.
# Estimators that draw random numbers while fitting (trees, ensembles, the MLP) get a
# fixed seed so that a fresh "Restart & Run All" reproduces the same cross-validation
# scores and the same fitted models. LogisticRegression and SVC are left at their
# defaults, matching the original cell.
RANDOM_STATE = 42
pipelines = {
    'Logistic Regression': Pipeline([
        ('scaler', StandardScaler()),
        ('clf', LogisticRegression()),
    ]),
    'Decision Tree': Pipeline([
        ('scaler', StandardScaler()),
        ('clf', DecisionTreeClassifier(random_state=RANDOM_STATE)),
    ]),
    'Random Forest': Pipeline([
        ('scaler', StandardScaler()),
        ('clf', RandomForestClassifier(random_state=RANDOM_STATE)),
    ]),
    'Gradient Boosting': Pipeline([
        ('scaler', StandardScaler()),
        ('clf', GradientBoostingClassifier(random_state=RANDOM_STATE)),
    ]),
    'SVM': Pipeline([
        ('scaler', StandardScaler()),
        ('clf', SVC()),
    ]),
    'Neural Network': Pipeline([
        ('scaler', StandardScaler()),
        ('clf', MLPClassifier(random_state=RANDOM_STATE)),
    ]),
}

In [None]:
# Search grids, keyed by the same display names as the pipelines.
# Every parameter carries the `clf__` prefix so that it is routed to the pipeline
# step named 'clf' rather than to the scaler.
hyperparameters = {
    'Logistic Regression': dict(
        clf__C=[0.01, 0.1, 1, 10],
    ),
    'Decision Tree': dict(
        clf__max_depth=[2, 4, 6, 8],
    ),
    'Random Forest': dict(
        clf__n_estimators=[50, 100, 200],
        clf__max_depth=[2, 4, 6, 8],
    ),
    'Gradient Boosting': dict(
        clf__n_estimators=[50, 100, 200],
        clf__max_depth=[2, 4, 6, 8],
    ),
    'SVM': dict(
        clf__C=[0.01, 0.1, 1, 10],
        clf__kernel=['linear', 'rbf'],
    ),
    'Neural Network': dict(
        clf__hidden_layer_sizes=[(10,), (20,), (30,)],
        clf__alpha=[0.0001, 0.001, 0.01],
    ),
}

In [None]:
# Tune each pipeline with a cross-validated grid search (cv=5) on the training split
# and keep the best estimator found for it under the same display name.
models = {}
for name, pipeline in pipelines.items():
    grid_search = GridSearchCV(
        estimator=pipeline,
        param_grid=hyperparameters[name],
        cv=5,
        n_jobs=-1,
    ).fit(X_train, y_train)
    models[name] = grid_search.best_estimator_
    # Report the best mean CV score and the fold-to-fold std at that same grid point.
    print(f'{name}: {grid_search.best_score_:.3f} +/- {grid_search.cv_results_["std_test_score"][grid_search.best_index_]:.3f}')

In [None]:
# Make predictions on new example
def _as_model_input(traits, model):
    """Shape one example for ``model``, aligning labelled traits to its training columns."""
    positional = traits.values.reshape(1, -1)
    # Estimators fitted on a DataFrame expose the training column names here; the
    # getattr default also covers models that do not provide the attribute at all.
    expected = getattr(model, 'feature_names_in_', None)
    if expected is None:
        return positional
    expected = list(expected)
    labelled = traits.index.is_unique and set(expected).issubset(traits.index)
    if not labelled:
        # Unlabelled (list/array) or differently labelled input: keep the positional
        # behaviour, i.e. values are passed in the order they were given.
        return positional
    # Labelled input: pick the values by name in the training column order, so the
    # key order of a dict/Series (and any extra keys) cannot shift the features.
    return pd.DataFrame([traits.reindex(expected).values], columns=expected)


def predict_job(personality_traits):
    """Predict the job for a single example with every model in ``models``.

    Parameters
    ----------
    personality_traits : list, array, dict or pandas.Series
        Feature values for one example. A list/array is used positionally. A dict
        or Series whose labels cover a model's ``feature_names_in_`` is aligned to
        those names before predicting; otherwise it is also used positionally.

    Returns
    -------
    dict
        Maps each model name to the first element of that model's ``predict`` output.
    """
    traits = pd.Series(personality_traits)
    predictions = {}
    for name, model in models.items():
        predictions[name] = model.predict(_as_model_input(traits, model))[0]
    return predictions