<a href="https://colab.research.google.com/github/amien1410/colab-notebooks/blob/main/Colab_Pyspark_SVM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#@title Install Kaggle modules and download the dataset

# Mount Google Drive so that files under /content/drive are available to this runtime.
from google.colab import drive
drive.mount('/content/drive')

# Install the Kaggle command-line client (the version is not pinned).
!pip install kaggle
import os
# Point the Kaggle client at a configuration directory on Drive
# (presumably where kaggle.json lives -- verify before running).
os.environ['KAGGLE_CONFIG_DIR'] = '/content/drive/MyDrive/kaggle'
# Download the competition archive, then unpack it quietly (-q).
!kaggle competitions download -c predict-student-performance-from-game-play
!unzip -q "/content/predict-student-performance-from-game-play.zip"

Mounted at /content/drive
Downloading predict-student-performance-from-game-play.zip to /content
 99% 960M/968M [00:11<00:00, 117MB/s] 
100% 968M/968M [00:11<00:00, 89.0MB/s]


In [4]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, split, struct
from pyspark.sql.types import *

from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import StandardScaler

from pyspark.sql.functions import *
from pyspark.sql.types import *


from xgboost import XGBClassifier
import xgboost as xgb


import gc

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import MinMaxScaler
import pickle

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


import warnings
warnings.filterwarnings("ignore")

In [5]:
# Start a Spark session for this notebook, or reuse the one that already exists.
spark = SparkSession.builder.appName("MyApp").getOrCreate()

# Explicit schema for train.csv, written as (column name, Spark type) pairs.
# Every column is declared nullable.
train_schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in [
        ("session_id", LongType()),
        ("index", ShortType()),
        ("elapsed_time", IntegerType()),
        ("event_name", StringType()),
        ("name", StringType()),
        ("level", ByteType()),
        ("page", IntegerType()),
        ("room_coor_x", DoubleType()),
        ("room_coor_y", DoubleType()),
        ("screen_coor_x", DoubleType()),
        ("screen_coor_y", DoubleType()),
        ("hover_duration", DoubleType()),
        ("text", StringType()),
        ("fqid", StringType()),
        ("room_fqid", StringType()),
        ("text_fqid", StringType()),
        ("fullscreen", ByteType()),
        ("hq", ByteType()),
        ("music", ByteType()),
        ("level_group", StringType()),
    ]
])

# Read train.csv with a header row, applying the schema above instead of inferring types.
df_train = spark.read.csv("/content/train.csv", header=True, schema=train_schema)
print ('**** output: df_train ****')

**** output: df_train ****


In [6]:
def data_transformation(df):
    """Clean the raw event log in Spark and aggregate it per session and level group.

    Steps, in order:
      1. Nulls in every 'double' column are replaced by that column's mean and
         nulls in "page" by 0. Columns whose mean is undefined (entirely null)
         are skipped here because ``fillna`` only accepts int, float, bool or
         string replacement values.
      2. Any "hover_duration" null that is still present is replaced by 0.
      3. Rows without a "level_group" are dropped, then nulls in string
         columns are replaced by the string "0".
      4. Six string columns are turned into numeric indices with StringIndexer.
      5. The remaining columns are averaged per (session_id, level_group) and
         the ``*_idx`` columns are renamed back to their source column names.

    Args:
        df: Spark DataFrame holding the raw events. It must contain the columns
            referenced below (session_id, level_group, page, hover_duration,
            the six indexed string columns and the aggregated columns).

    Returns:
        pandas.DataFrame with one row per (session_id, level_group).
    """
    # Columns Spark reports as 'double'; "page" is excluded because it is filled with 0.
    numeric_cols = [c for c, dtype in df.dtypes if dtype == 'double' and c != 'page']

    # A single aggregation pass yields every column mean. agg() needs at least one
    # expression, so it is skipped when there are no double columns.
    means = {}
    if numeric_cols:
        means = df.agg(*(mean(c).alias(c) for c in numeric_cols)).first().asDict()

    # An entirely-null column has a None mean, which is not a valid replacement value.
    fill_values = {name: value for name, value in means.items() if value is not None}
    fill_values['page'] = 0

    # Fill missing values with the per-column fill values
    filled = df.fillna(fill_values)

    # Fallback for hover_duration: if it is one of the double columns it has normally
    # been mean-filled already, so this only matters when its mean was undefined.
    filled = filled.fillna(0, subset=['hover_duration'])

    # Drop rows with missing values in "level_group" column
    filled = filled.na.drop(subset=["level_group"])

    # Replace nulls in string columns with the placeholder category '0'
    categorical_cols = [c for c, dtype in df.dtypes if dtype == 'string']
    filled = filled.fillna('0', subset=categorical_cols)

    # Transform categorical data into numerical data using StringIndexer.
    # NOTE(review): the indexer is fitted on whatever frame is passed in, so the
    # resulting indices are only comparable within that frame.
    indexer = StringIndexer(inputCols=['event_name', 'name', 'fqid', 'text', 'room_fqid', 'text_fqid'],
                            outputCols=['event_name_idx', 'name_idx', 'fqid_idx', 'text_idx', 'room_fqid_idx', 'text_fqid_idx'])
    indexed = indexer.fit(filled).transform(filled)

    # Keep everything except the raw string columns; level_group stays as a grouping key.
    string_cols = [c for c, dtype in indexed.dtypes if dtype == 'string' and c != 'level_group']
    df_numeric = indexed.select([c for c in indexed.columns if c not in string_cols])

    # Group by session and level group and average the listed columns.
    cols_for_group = ['elapsed_time', 'page', 'room_coor_x', 'room_coor_y', 'screen_coor_x',
                      'screen_coor_y', 'hover_duration', 'fullscreen', 'hq', 'music','event_name_idx', 'name_idx',
                      'fqid_idx', 'text_idx', 'room_fqid_idx', 'text_fqid_idx']
    grouped = df_numeric.groupby(['session_id', 'level_group']).agg(*(avg(col(c)).alias(c) for c in cols_for_group))

    # Give the indexed columns their original (un-suffixed) names back.
    column_mapping = {
        'event_name_idx': 'event_name',
        'name_idx': 'name',
        'text_idx': 'text',
        'fqid_idx': 'fqid',
        'room_fqid_idx': 'room_fqid',
        'text_fqid_idx': 'text_fqid',
    }
    # The loop variable is deliberately not called `col`, which is the Spark helper used above.
    grouped = grouped.toDF(*[column_mapping.get(name, name) for name in grouped.columns])

    # Collect the aggregated result to the driver as a pandas DataFrame
    df_pandas = grouped.toPandas()
    print('************** Done!!! ***************')

    return df_pandas

# Run the Spark preprocessing on the training events; the result is a pandas DataFrame.
dftrain = data_transformation(df_train)

************** Done!!! ***************


In [9]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder



def data_transformation_test(df):
    """Impute and encode a raw test-event frame with pandas.

    Steps, in order:
      1. Numeric columns (minus the first two and "page") are cast to float and
         their NaNs replaced by the column mean; a column with no values at all
         is left as NaN at this point.
      2. Remaining NaNs in "hover_duration" and "page" are replaced by 0.
      3. NaNs in object (text) columns are replaced by the string "0" and each
         text column except "level_group" is replaced by integer codes that
         follow the sorted order of its distinct values.
      4. The "index" column is dropped.
      5. "level_group" is returned as text labels: a text column is kept as is,
         a numeric one is decoded with 0 -> "0-4", 1 -> "5-12", 2 -> "13-22"
         (any other value is left unchanged).

    Args:
        df: pandas DataFrame of raw events. It is not modified.

    Returns:
        A new pandas DataFrame with the transformations applied.
    """
    # Work on a copy so the caller's frame is untouched and the call can be repeated.
    df = df.copy()

    # Numeric columns to mean-impute. The first two numeric columns are skipped by
    # position (presumably session_id and index -- TODO confirm against the CSV
    # column order); "page" is filled with 0 further down instead.
    numeric_cols = df.select_dtypes(include=np.number).columns.tolist()[2:]
    numeric_cols = [c for c in numeric_cols if c != 'page']

    # Mean imputation. Columns come out as float64; an all-NaN column has no mean
    # and therefore stays NaN instead of breaking the assignment.
    if numeric_cols:
        numeric = df[numeric_cols].astype('float64')
        df[numeric_cols] = numeric.fillna(numeric.mean())

    # Whatever is still missing in these two columns becomes 0.
    df['hover_duration'] = df['hover_duration'].fillna(0)
    df['page'] = df['page'].fillna(0)

    # Text columns: missing values become the placeholder category "0".
    categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
    if categorical_cols:
        df[categorical_cols] = df[categorical_cols].fillna('0')

    # Encode each text column as integer codes in sorted-category order.
    # "level_group" is skipped: sorted order puts "13-22" before "5-12", so its
    # codes cannot be decoded with the 0/1/2 table used below.
    level_group_is_text = 'level_group' in categorical_cols
    for name in categorical_cols:
        if name == 'level_group':
            continue
        df[name] = pd.Categorical(df[name]).codes.astype('int64')

    df = df.drop(columns=['index'])

    # A numeric level_group is decoded to its text label. The default is an object
    # array so that strings and unmatched numbers share a common dtype.
    if 'level_group' in df.columns and not level_group_is_text:
        codes = df['level_group']
        conditions = [codes == 0, codes == 1, codes == 2]
        values = ["0-4", "5-12", "13-22"]
        df['level_group'] = np.select(conditions, values, default=codes.astype(object))

    return df

# Read the raw test events with pandas and run the pandas preprocessing on them.
df_test  = pd.read_csv("/content/test.csv")
test_df = data_transformation_test(df_test)
print ('******* output: test_df ********')

******* output: test_df ********


In [11]:
# Column dtypes for train_labels.csv. session_id is read as text because it is a
# composite "<session>_<question>" key that is split on "_" below.
# NOTE(review): the modelling cell uses a column named "correct" as the label;
# confirm whether the "target" entry here matches a real column of the file.
train_labels_schema = {"session_id": str, "target": int}

# Load train_labels.csv into a pandas DataFrame
df_train_labels = pd.read_csv("/content/train_labels.csv", dtype=train_labels_schema)

# Split the composite key at its first underscore only, so the result always has
# at most two parts: the session part and the question part.
df_train_labels[["session_id", "level"]] = df_train_labels["session_id"].str.split("_", n=1, expand=True)

# Keep the first run of digits from each part. Raw-string patterns avoid the
# invalid "\d" escape, and an explicit 64-bit cast keeps large session ids intact
# on every platform.
df_train_labels["session_id"] = df_train_labels["session_id"].str.extract(r'(\d+)', expand=False).astype("int64")
df_train_labels["level"] = df_train_labels["level"].str.extract(r'(\d+)', expand=False).astype("int64")

In [13]:
def create_level_group_column(df):
    """Add "level_group" and "session_level" columns derived from df["level"].

    The first band whose upper bound is >= the level wins:
    level <= 3 -> "0-4" (1), level <= 13 -> "5-12" (2), level <= 22 -> "13-22" (3).
    Anything else is labelled 'Unknown' with a session_level of 0.

    The frame is modified in place and also returned.
    """
    # (inclusive upper bound, text label, numeric code), checked in this order.
    bands = (
        (3, "0-4", 1),
        (13, "5-12", 2),
        (22, "13-22", 3),
    )

    level = df["level"]
    df["level_group"] = np.select(
        [level <= upper for upper, _, _ in bands],
        [label for _, label, _ in bands],
        default='Unknown',
    )

    # Numeric counterpart of the label; labels outside the table map to 0.
    code_by_label = {label: code for _, label, code in bands}
    df['session_level'] = [code_by_label.get(label, 0) for label in df['level_group']]

    return df
# Add the "level_group" and "session_level" columns to the labels frame.
# The function modifies df_train_labels in place and returns that same frame.
target = create_level_group_column(df_train_labels)
print ('******* output: target ********')

******* output: target ********


In [16]:
from sklearn.svm import SVC
from sklearn.impute import SimpleImputer

f1_scores = []  # One F1-score per question, in question order

# Every column except the level-group label and the target is used as a feature.
# NOTE(review): this keeps identifier-like columns (e.g. session_id, if present)
# and the constant "level" column in the feature matrix -- confirm that is intended.
FEATURES = [c for c in df_train.columns if c not in ["level_group", "correct"]]
target_column = "correct"

# This cell needs a frame with one row per (session, question) holding a "level"
# column and the target. Fail early with a clear message when the frame currently
# bound to df_train does not have them.
missing_columns = [c for c in ("level", target_column) if c not in df_train.columns]
if missing_columns:
    raise KeyError(
        f"df_train is missing required column(s) {missing_columns}; "
        "build the merged features + labels frame before running this cell."
    )

# Iterate over questions from 1 to 18
for i in range(1, 19):
    print(f'Question {i}:')

    # Select the rows for the current question
    current_question = df_train[df_train['level'] == i]

    # Hold out 30% of the rows for evaluation (fixed seed for repeatability).
    # NOTE(review): the split is not stratified on the target.
    X_train, X_test, y_train, y_test = train_test_split(current_question[FEATURES],
                                                        current_question[target_column],
                                                        test_size=0.3,
                                                        random_state=42)

    # Mean-impute missing values; the imputer is fitted on the training part only.
    imputer = SimpleImputer(strategy='mean')
    X_train = imputer.fit_transform(X_train)
    X_test = imputer.transform(X_test)

    # Train an RBF-kernel SVM.
    # NOTE(review): features are not scaled before fitting; the scikit-learn docs
    # recommend scaling for SVMs, so consider adding a scaler here.
    model = SVC(kernel='rbf', C=1.0, random_state=42)
    model.fit(X_train, y_train)

    # Make predictions
    y_pred = model.predict(X_test)

    # F1 of the positive class (label 1). It is 0.0 when the model never predicts 1;
    # zero_division=0 states that outcome explicitly.
    f1 = f1_score(y_test, y_pred, zero_division=0)
    f1_scores.append(f1)  # Store the F1-score

    print(f'F1-score: {f1}')

# After the loop, model / X_train / X_test / y_train / y_test hold the values of
# the last question (18).

# Calculate the average F1-score
average_f1 = np.mean(f1_scores)
print(f'Average F1-score: {average_f1}')

Question 1:
F1-score: 0.8450290008986194
Question 2:
F1-score: 0.989493245657923
Question 3:
F1-score: 0.9659913698529949
Question 4:
F1-score: 0.8888714240804778
Question 5:
F1-score: 0.7011483693155719
Question 6:
F1-score: 0.8773127928214087
Question 7:
F1-score: 0.8498210218027986
Question 8:
F1-score: 0.7605856053300605
Question 9:
F1-score: 0.8436119744806151
Question 10:
F1-score: 0.6620611337181792
Question 11:
F1-score: 0.7839325649406502
Question 12:
F1-score: 0.926418103120966
Question 13:
F1-score: 0.0
Question 14:
F1-score: 0.8289571771722024
Question 15:
F1-score: 0.0
Question 16:
F1-score: 0.8475709162047603
Question 17:
F1-score: 0.8124317513649727
Question 18:
F1-score: 0.9764714399478752
Average F1-score: 0.7533171050394487


In [17]:
# Reuse the classifier left over from the training loop. It was already fitted on
# the current X_train / y_train (the last question processed), so it is not
# trained again here.
clf = model

# Raw decision scores for the held-out rows.
# Assumes a binary target, so there is one score per row.
y_pred_scores = clf.decision_function(X_test)

# Grid-search the score cut-off that maximises F1 on the held-out rows.
# NOTE(review): the threshold is chosen on the same rows it is scored on, so the
# reported F1 is optimistic; a separate validation split would be cleaner.
# If the best threshold lands on either end of the grid, widen the grid.
best_threshold = None
best_f1_score = 0.0

for threshold in np.arange(-1.0, 1.1, 0.1):
    # Rows scoring above the cut-off are predicted as class 1.
    y_pred = (y_pred_scores > threshold).astype(int)
    f1 = f1_score(y_test, y_pred, zero_division=0)

    # Strict comparison: on ties the lowest threshold is kept.
    if f1 > best_f1_score:
        best_f1_score = f1
        best_threshold = threshold

print("Best Threshold:", best_threshold)
print("Best F1-score:", best_f1_score)

Best Threshold: -1.0
Best F1-score: 0.9764714399478752
