In [1]:
#testing how to split data to favor correct responses

In [2]:
import pandas as pd
import numpy as np

# Load the response matrix from disk and show it as the cell output.
responses_path = "response_matrix.csv"
df = pd.read_csv(responses_path)
df

Unnamed: 0,user_id,problem_id,correct
0,70759,74567,0.0
1,70759,74564,0.0
2,70759,74557,1.0
3,70759,74571,1.0
4,70759,74553,1.0
...,...,...,...
344853,96282,93740,1.0
344854,96282,135605,1.0
344855,96282,135607,1.0
344856,96282,135601,1.0


In [3]:
# Existing basic split method: shuffle the rows, then cut them into five
# equally sized bins ordered by user_id.
# The shuffle is seeded so the folds (and the numbers printed below) are
# reproducible on a fresh kernel; 217 matches the seed used for the other
# splits in this notebook.
df_rand = df.sample(frac=1, random_state=217).reset_index(drop=True)

# Rank every row by user_id. method='first' breaks ties by (shuffled) row
# order, so each row gets a unique rank and qcut can form 5 bins with the same
# number of rows. Because the bins are equal in row count, a user whose rows
# straddle a bin boundary ends up in two adjacent folds.
quantile_bins = pd.qcut(df_rand['user_id'].rank(method='first'), q=5, labels=False)

# Add the quantile bins as a new column to the DataFrame
df_rand['quantile_bin'] = quantile_bins

# Split the DataFrame into 5 parts based on the quantile bins (5-fold validation)
dfs = [
    df_rand[df_rand['quantile_bin'] == i].drop(columns='quantile_bin')
    for i in range(5)
]

# Per fold: share of correct responses, then number of distinct users.
for d in dfs:
    print(d['correct'].mean())
    print(len(d['user_id'].unique()))

0.6610472365597634
791
0.6451842078554755
316
0.625134112393435
1080
0.5984652970088878
1146
0.6080583425158035
957


In [4]:
from sklearn.model_selection import train_test_split

users = df['user_id'].unique()

# Split every user's rows 80/20 so that each user appears in both train and
# test. The per-user pieces are collected in lists and concatenated once at the
# end: calling pd.concat inside the loop re-copies the growing frame on every
# iteration (quadratic in the number of users).
train_parts = []
test_parts = []

# sort=False keeps users in order of first appearance, i.e. the order of `users`.
for uid, udf in df.groupby('user_id', sort=False):
    user_rows = udf[["user_id", "problem_id", "correct"]]
    # Same seed for every user, so the split is reproducible.
    user_train, user_test = train_test_split(user_rows, test_size=0.2, random_state=217)
    train_parts.append(user_train)
    test_parts.append(user_test)

# Guard against an empty input frame (pd.concat rejects an empty list).
train = pd.concat(train_parts) if train_parts else pd.DataFrame()
test = pd.concat(test_parts) if test_parts else pd.DataFrame()

# should be the same
print(len(train['user_id'].unique()))
print(len(test['user_id'].unique()))
print(len(df['user_id'].unique()))

4286
4286
4286


In [5]:
# Share of correct responses in the train split, the test split and the full
# data set, in that order -- the three values should be close to each other.
for frame in (train, test, df):
    print(frame['correct'].mean())

0.6275521182855309
0.6276777881540595
0.6275778726316339


In [6]:
from sklearn.model_selection import KFold
# Switch to k-fold: a 5-fold splitter that shuffles the rows before folding,
# seeded so the folds are identical on every run.
kf = KFold(
    n_splits=5,
    shuffle=True,
    random_state=217,
)

In [7]:
# ChatGPT-generated example code, used as a reference for the KFold split
from sklearn.model_selection import KFold

# Toy data: six two-feature samples ([1, 2], [3, 4], ... [11, 12]) with
# alternating 0/1 labels.
X = [[2 * k + 1, 2 * k + 2] for k in range(6)]
y = [k % 2 for k in range(6)]

# 5-fold cross-validation splitter (shuffled, fixed seed)
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Walk through the folds and materialise each train/test partition
for train_index, test_index in kf.split(X):
    X_train = [X[pos] for pos in train_index]
    X_test = [X[pos] for pos in test_index]
    y_train = [y[pos] for pos in train_index]
    y_test = [y[pos] for pos in test_index]

    # This is where a model would be trained and evaluated; here the
    # partitions are only printed.
    for label, values in (
        ("Training feature matrix:", X_train),
        ("Testing feature matrix:", X_test),
        ("Training target vector:", y_train),
        ("Testing target vector:", y_test),
    ):
        print(label, values)


Training feature matrix: [[5, 6], [7, 8], [9, 10], [11, 12]]
Testing feature matrix: [[1, 2], [3, 4]]
Training target vector: [0, 1, 0, 1]
Testing target vector: [0, 1]
Training feature matrix: [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]]
Testing feature matrix: [[11, 12]]
Training target vector: [0, 1, 0, 1, 0]
Testing target vector: [1]
Training feature matrix: [[1, 2], [3, 4], [7, 8], [9, 10], [11, 12]]
Testing feature matrix: [[5, 6]]
Training target vector: [0, 1, 1, 0, 1]
Testing target vector: [0]
Training feature matrix: [[1, 2], [3, 4], [5, 6], [7, 8], [11, 12]]
Testing feature matrix: [[9, 10]]
Training target vector: [0, 1, 0, 1, 1]
Testing target vector: [0]
Training feature matrix: [[1, 2], [3, 4], [5, 6], [9, 10], [11, 12]]
Testing feature matrix: [[7, 8]]
Training target vector: [0, 1, 0, 0, 1]
Testing target vector: [1]


In [27]:
users = df['user_id'].unique()

n = 5

kf = KFold(n_splits=n, shuffle=True, random_state=217)

# One list of per-user pieces for each fold. The pieces are concatenated once
# per fold after the loop; calling pd.concat inside the loop (users x folds
# times) re-copies the growing frames every time and is quadratic.
train_parts = [[] for _ in range(n)]
test_parts = [[] for _ in range(n)]

# Perform a KFold split on each individual user's data so that every user is
# present in the train and the test part of every fold.
# sort=False keeps users in order of first appearance, i.e. the order of `users`.
# NOTE: KFold raises ValueError for a user with fewer than `n` rows.
for uid, udf in df.groupby('user_id', sort=False):
    # Loop names are deliberately not `train`/`test`, so the notebook-level
    # frames of the earlier single split are not overwritten by one user's fold.
    for fold_index, (train_index, test_index) in enumerate(kf.split(udf)):
        train_parts[fold_index].append(udf.iloc[train_index])
        test_parts[fold_index].append(udf.iloc[test_index])


def _combine(parts):
    """Stack the per-user pieces of one fold; empty input gives an empty frame."""
    if not parts:
        return pd.DataFrame()
    return pd.concat(parts, axis=0, ignore_index=True)


# dfs[i] holds the complete train/test frames of fold i.
dfs = [
    {
        "train": _combine(train_parts[i]),
        "test": _combine(test_parts[i]),
    }
    for i in range(n)
]

print(len(dfs))
               


5


In [28]:
# For every fold, print the number of distinct users in its train part, its
# test part and the full data set, followed by len(users).
# All four numbers should be the same!
for fold in dfs:
    for frame in (fold["train"], fold["test"], df):
        print(len(frame["user_id"].unique()))
    print(len(users))

4286
4286
4286
4286
4286
4286
4286
4286
4286
4286
4286
4286
4286
4286
4286
4286
4286
4286
4286
4286


In [29]:
# Show the list of per-fold {"train", "test"} frames as the cell output.
dfs

[{'train':         user_id  problem_id  correct
  0         70759       74557      1.0
  1         70759       74571      1.0
  2         70759       74553      1.0
  3         70759       57767      1.0
  4         70759       49064      0.0
  ...         ...         ...      ...
  274179    96282       93728      0.0
  274180    96282       93740      1.0
  274181    96282      135605      1.0
  274182    96282      135607      1.0
  274183    96282      135616      1.0
  
  [274184 rows x 3 columns],
  'test':        user_id  problem_id  correct
  0        70759       74567      0.0
  1        70759       74564      0.0
  2        70759       57875      0.0
  3        70759       37657      0.0
  4        70759       37671      0.0
  ...        ...         ...      ...
  70669    96282       87041      0.0
  70670    96282       88596      0.0
  70671    96282       58814      0.0
  70672    96282       93751      1.0
  70673    96282      135601      1.0
  
  [70674 rows x 3 column