Block 1: Load the Data & Basic Preprocessing

In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier

# Load dataset
df = pd.read_csv("fake_social_personality_data_2000.csv")
print(df.head())

# Encode categorical features
df['top_word_category_encoded'] = LabelEncoder().fit_transform(df['top_word_category'])
df['personality_encoded'] = LabelEncoder().fit_transform(df['personality'])
print(df.head())

# Define features and target
X = df[['posts_per_day', 'avg_post_length', 'num_hashtags', 'emoji_usage_percent',
        'avg_post_hour', 'top_word_category_encoded']]
y = df['personality_encoded']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

  username  posts_per_day  avg_post_length  num_hashtags  emoji_usage_percent  \
0   user_0              9              285             2                43.64   
1   user_1              7              147             8                39.85   
2   user_2              2              231             5                25.75   
3   user_3              6              282             0                 6.54   
4   user_4              4               99             2                10.75   

   avg_post_hour top_word_category personality  
0              8       informative   Introvert  
1             19       informative   Introvert  
2             21           neutral   Extrovert  
3              5           neutral   Introvert  
4             10           neutral   Introvert  
  username  posts_per_day  avg_post_length  num_hashtags  emoji_usage_percent  \
0   user_0              9              285             2                43.64   
1   user_1              7              147             8 

Block 2: Build Pipeline and Run GridSearch

In [9]:
# Define a pipeline with StandardScaler and KNN
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('knn', KNeighborsClassifier())
])

# Parameter grid for GridSearchCV
param_grid = {
    'knn__n_neighbors': list(range(3, 21)),
    'knn__weights': ['uniform', 'distance'],
    'knn__metric': ['euclidean', 'manhattan']
}

# Grid search with cross-validation
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Show the best parameters
print("Best Parameters:", grid_search.best_params_)

Best Parameters: {'knn__metric': 'euclidean', 'knn__n_neighbors': 4, 'knn__weights': 'uniform'}


 Block 3: Evaluate the Final Model

In [10]:
# Get the best model from GridSearch
best_model = grid_search.best_estimator_

# Predict on test data
y_pred = best_model.predict(X_test)

# Show classification performance
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.54      0.80      0.64       210
           1       0.52      0.24      0.33       190

    accuracy                           0.54       400
   macro avg       0.53      0.52      0.49       400
weighted avg       0.53      0.54      0.50       400

