# Student Cognitive Skills Analysis

This notebook contains:
1. Synthetic student dataset generation
2. Correlation analysis between cognitive skills and performance
3. ML model for assessment score prediction
4. Student clustering into learning personas


In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Seed NumPy's global random generator so every np.random draw below is
# repeatable across Restart & Run All
np.random.seed(seed=42)


In [None]:
# Generate synthetic student data
# NOTE: for a given seed the generated values depend on the order of the
# np.random calls in this cell, so keep the draws in this order.
n_students = 1000

# One random draw per student for each feature; skill scores are clipped to
# 0-100 and engagement time to 0-90
student_data = {
    'student_id': range(1, n_students + 1),
    'name': [f'Student_{i}' for i in range(1, n_students + 1)],
    'class': np.random.choice(['A', 'B', 'C', 'D'], n_students),
    'comprehension': np.random.normal(70, 15, n_students).clip(0, 100),
    'attention': np.random.normal(65, 20, n_students).clip(0, 100),
    'focus': np.random.normal(75, 10, n_students).clip(0, 100),
    'retention': np.random.normal(68, 18, n_students).clip(0, 100),
    'engagement_time': np.random.normal(45, 15, n_students).clip(0, 90)  # in minutes
}

# Contribution of each feature to the assessment score (weights sum to 1)
weights = {
    'comprehension': 0.3,
    'attention': 0.2,
    'focus': 0.2,
    'retention': 0.2,
    'engagement_time': 0.1
}

# Min-max rescale engagement_time to 0-100 so it is on the same scale as the
# other features before weighting
engagement_time = student_data['engagement_time']
engagement_range = engagement_time.max() - engagement_time.min()
if engagement_range > 0:
    engagement_normalized = (engagement_time - engagement_time.min()) / engagement_range * 100
else:
    # All engagement times are identical (e.g. a single student): avoid 0/0
    engagement_normalized = np.zeros_like(engagement_time)

# Weighted sum of the features plus Gaussian noise (std 5), clipped to 0-100
base_score = (
    weights['comprehension'] * student_data['comprehension'] +
    weights['attention'] * student_data['attention'] +
    weights['focus'] * student_data['focus'] +
    weights['retention'] * student_data['retention'] +
    weights['engagement_time'] * engagement_normalized
)
student_data['assessment_score'] = (base_score + np.random.normal(0, 5, n_students)).clip(0, 100)

# Create DataFrame
df = pd.DataFrame(student_data)

# Save to CSV; create the target directory first because to_csv does not
# create missing parent directories
output_path = Path('../data/student_data.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)
print("Dataset shape:", df.shape)
df.head()


In [None]:
# Pairwise correlations (DataFrame.corr default) between the cognitive-skill
# columns and the assessment score
analysis_columns = ['comprehension', 'attention', 'focus', 'retention', 'engagement_time', 'assessment_score']
correlation_matrix = df[analysis_columns].corr()

# Heatmap of the full matrix, colour scale centred on zero
plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0)
plt.title('Correlation Matrix: Cognitive Skills vs Performance')
plt.show()

# Report each feature's correlation with the score; the last row is the
# score itself, so it is left out
print("\nKey Correlations with Assessment Score:")
score_correlations = correlation_matrix['assessment_score'].iloc[:-1]
for col, corr in score_correlations.items():
    print(f"{col}: {corr:.3f}")


In [None]:
# Build ML model to predict assessment scores from the five cognitive-skill features
feature_columns = ['comprehension', 'attention', 'focus', 'retention', 'engagement_time']
X = df[feature_columns]
y = df['assessment_score']

# Hold out 20% of the rows for evaluation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features; the scaler is fit on the training split only and
# then applied to the test split
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train linear regression model
model = LinearRegression()
model.fit(X_train_scaled, y_train)

# Make predictions on the held-out rows
y_pred = model.predict(X_test_scaled)

# Print model performance
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Model Performance:")
print(f"Mean Squared Error: {mse:.2f}")
print(f"R² Score: {r2:.2f}")

# Feature importance: the coefficients are on standardized features, so their
# magnitudes are comparable. Rank by absolute value so that a strongly
# negative coefficient is not listed as the least important feature.
feature_importance = pd.DataFrame({
    'Feature': X.columns,
    'Importance': model.coef_
})
print("\nFeature Importance:")
print(feature_importance.sort_values('Importance', ascending=False, key=lambda coef: coef.abs()))


In [None]:
# Cluster students into learning personas
# Select features for clustering and standardize them so each feature
# contributes on the same scale to the distance computation
cluster_features = ['comprehension', 'attention', 'focus', 'retention', 'engagement_time']
X_cluster = df[cluster_features]
X_cluster_scaled = StandardScaler().fit_transform(X_cluster)

# Elbow method: fit KMeans for k = 1..10 and record the inertia of each fit.
# n_init is set explicitly because its default differs between scikit-learn
# releases; pinning it keeps the results consistent across versions.
inertias = []
K = range(1, 11)
for k in K:
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeans.fit(X_cluster_scaled)
    inertias.append(kmeans.inertia_)

# Plot elbow curve
plt.figure(figsize=(10, 6))
plt.plot(K, inertias, 'bx-')
plt.xlabel('k')
plt.ylabel('Inertia')
plt.title('Elbow Method For Optimal k')
plt.show()

# Perform clustering with k=4
# NOTE(review): k=4 is hard-coded rather than derived from the curve above;
# confirm it against the elbow plot.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=42)
df['cluster'] = kmeans.fit_predict(X_cluster_scaled)

# Analyze cluster characteristics: per-cluster mean of each feature and the score
cluster_means = df.groupby('cluster')[cluster_features + ['assessment_score']].mean()
print("\nCluster Characteristics:")
print(cluster_means)

# Save processed data for dashboard; create the target directory first
# because to_csv does not create missing parent directories
processed_path = Path('../data/processed_student_data.csv')
processed_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(processed_path, index=False)
