<a href="https://colab.research.google.com/github/brianhphillips/testrepo/blob/main/RanFor_ExtInt.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import kagglehub
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns

In [19]:
# Re-download the dataset to ensure correct version
path = kagglehub.dataset_download("rakeshkapilavai/extrovert-vs-introvert-behavior-data", force=True)
print("Re-downloaded to:", path)

TypeError: dataset_download() got an unexpected keyword argument 'force'

In [3]:
# Step 2: Load CSV file
dataset_path = os.path.join(path, "personality_dataset.csv")
df = pd.read_csv(dataset_path)

In [18]:
import os

# Show all files in the downloaded dataset path
print("Files in dataset folder:\n", os.listdir(path))

Files in dataset folder:
 ['personality_dataset.csv']


In [4]:
# Step 3: Inspect data
print(df.head())
print("\nColumns:", df.columns)

   Time_spent_Alone Stage_fear  Social_event_attendance  Going_outside  \
0               4.0         No                      4.0            6.0   
1               9.0        Yes                      0.0            0.0   
2               9.0        Yes                      1.0            2.0   
3               0.0         No                      6.0            7.0   
4               3.0         No                      9.0            4.0   

  Drained_after_socializing  Friends_circle_size  Post_frequency Personality  
0                        No                 13.0             5.0   Extrovert  
1                       Yes                  0.0             3.0   Introvert  
2                       Yes                  5.0             2.0   Introvert  
3                        No                 14.0             8.0   Extrovert  
4                        No                  8.0             5.0   Extrovert  

Columns: Index(['Time_spent_Alone', 'Stage_fear', 'Social_event_attendance',
   

In [16]:
# Investigate what's actually in the 'Personality' column
print("Raw 'Personality' values:\n", df['Personality'].unique())
print("\nSample values:\n", df['Personality'].dropna().unique()[:20])
print("\nValue counts:\n", df['Personality'].value_counts(dropna=False))

Raw 'Personality' values:
 []

Sample values:
 []

Value counts:
 Series([], Name: count, dtype: int64)


In [17]:
# Check the column names
print("Column names:\n", df.columns.tolist())

# Show first few rows of the dataset
print("\nFirst few rows:\n", df.head())

Column names:
 ['Time_spent_Alone', 'Stage_fear', 'Social_event_attendance', 'Going_outside', 'Drained_after_socializing', 'Friends_circle_size', 'Post_frequency', 'Personality']

First few rows:
 Empty DataFrame
Columns: [Time_spent_Alone, Stage_fear, Social_event_attendance, Going_outside, Drained_after_socializing, Friends_circle_size, Post_frequency, Personality]
Index: []


In [12]:
# Step 4: Preprocess data

# Clean target column: lowercase, strip whitespace
df['Personality'] = df['Personality'].astype(str).str.strip().str.lower()

# Show unique values to debug
print("Unique values in 'Personality':", df['Personality'].unique())

# Keep only valid labels and map them
valid_labels = {'extrovert': 1, 'introvert': 0}
df = df[df['Personality'].isin(valid_labels.keys())]
df['Personality'] = df['Personality'].map(valid_labels)

# Drop rows with missing values in features
df = df.dropna(subset=['Personality'])

# Separate features and target
X = df.drop(columns=['Personality'])
y = df['Personality']

# One-hot encode categorical features
X_encoded = pd.get_dummies(X, drop_first=True)

# Fill any remaining NaNs (from features)
X_encoded = X_encoded.fillna(X_encoded.median(numeric_only=True))

Unique values in 'Personality': []


In [14]:
print(f"Final dataset size: {len(df)} rows")
print(f"Label distribution:\n{y.value_counts()}")

Final dataset size: 0 rows
Label distribution:
Series([], Name: count, dtype: int64)


In [15]:
# Step 5: Train/Test split
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded, y, test_size=0.2, random_state=42, stratify=y)

ValueError: With n_samples=0, test_size=0.2 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.

In [7]:
# Step 6: Train Random Forest model
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

ValueError: could not convert string to float: 'Yes'

In [None]:
# Step 7: Evaluate model
y_pred = rf.predict(X_test)
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nAccuracy Score:", accuracy_score(y_test, y_pred))

In [None]:
# Step 8: Plot Feature Importances
feature_importances = pd.Series(rf.feature_importances_, index=X.columns)
feature_importances_sorted = feature_importances.sort_values(ascending=False)

plt.figure(figsize=(10, 6))
sns.barplot(x=feature_importances_sorted, y=feature_importances_sorted.index, palette="viridis")
plt.title("Feature Importances (Random Forest)")
plt.xlabel("Importance Score")
plt.ylabel("Features")
plt.tight_layout()
plt.show()