# Learn how to split data for training, validation, and testing.
Data source: [UCI Student Performance dataset](https://archive.ics.uci.edu/dataset/320/student+performance)

In [1]:
import pandas as pd

In [2]:
# student-mat.csv separates its fields with semicolons rather than commas,
# so the separator has to be passed explicitly.
student_path = "../../data/as1/student-mat.csv"
df = pd.read_csv(student_path, sep=';')
df  # bare last expression: the notebook renders the frame

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,6,5,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,4,5,5,6
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,10,7,8,10
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,2,15,14,15
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,4,6,10,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
390,MS,M,20,U,LE3,A,2,2,services,services,...,5,5,4,4,5,4,11,9,9,9
391,MS,M,17,U,LE3,T,3,1,services,services,...,2,4,5,3,4,2,3,14,16,16
392,MS,M,21,R,GT3,T,1,1,other,other,...,5,5,3,3,3,3,3,10,8,7
393,MS,M,18,R,LE3,T,3,2,services,other,...,4,4,1,3,4,5,0,11,12,10


#### Our target column is G3, the students' final grade

## Task 1: Perform random splitting.

Import the modules needed for working with the model.

In [3]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn import svm

#### Apply label encoding, because some of the columns are categorical

In [4]:
# Columns that hold string categories and must be turned into integer codes.
categorical_columns = [
    'school', 'sex', 'address', 'famsize', 'Pstatus', 'Mjob', 'Fjob',
    'reason', 'guardian', 'schoolsup', 'famsup', 'paid', 'activities',
    'nursery', 'higher', 'internet', 'romantic',
]

# Fit a fresh LabelEncoder per column so each column gets its own code mapping.
for column in categorical_columns:
    df[column] = LabelEncoder().fit_transform(df[column])

df.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,0,0,18,1,0,0,4,4,0,4,...,4,3,4,1,1,3,6,5,6,6
1,0,0,17,1,0,1,1,1,0,2,...,5,3,3,1,1,3,4,5,5,6
2,0,0,15,1,1,1,1,1,0,2,...,4,3,2,2,3,3,10,7,8,10
3,0,0,15,1,0,1,4,2,1,3,...,3,2,2,1,1,5,2,15,14,15
4,0,0,16,1,0,1,3,3,2,2,...,4,3,2,1,2,5,4,6,10,10


In [5]:
# Count how many rows carry each distinct G3 value.
class_counts = df['G3'].value_counts()

# Keep only the G3 values that occur at least twice, then drop every row
# whose grade is not among them.
# NOTE(review): presumably needed so the stratified split below has enough
# samples per class; confirm.
valid_classes = class_counts.loc[class_counts.ge(2)].index
df = df.loc[df['G3'].isin(valid_classes)]

Split the data randomly: 80% training, 20% testing.

In [6]:
# Random hold-out split: every column except G3 is a feature, G3 is the target.
# The fixed random_state makes the partition repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    df.drop('G3', axis=1),
    df['G3'],
    test_size=0.2,
    random_state=42,
)
(x_train.shape, x_test.shape)

((314, 32), (79, 32))

## Task 2: Perform stratified sampling.

In [7]:
# Same 80/20 hold-out as before, but split in a stratified fashion with the
# G3 values used as the class labels.
x_train, x_test, y_train, y_test = train_test_split(
    df.drop('G3', axis=1),
    df['G3'],
    stratify=df['G3'],
    test_size=0.2,
    random_state=42,
)

(x_train.shape, x_test.shape)

((314, 32), (79, 32))

## Task 3: Apply cross-validation

### Initialize the classifier

In [8]:
# Support vector classifier with a linear kernel; the seed is fixed for repeatability.
clf = svm.SVC(
    kernel='linear',
    C=1,
    random_state=42,
)

In [9]:
# Target vector (final grade) and feature matrix (all remaining columns).
y = df['G3']
x = df.drop('G3', axis=1)

In [10]:
# Evaluate the classifier with cv=5; one score is produced per fold.
scores = cross_val_score(estimator=clf, X=x, y=y, cv=5)

In [11]:
# Report the per-fold scores followed by their average.
for label, value in (
    ("Cross-validation scores:", scores),
    ("Mean cross-validation score:", scores.mean()),
):
    print(label, value)

Cross-validation scores: [0.35443038 0.39240506 0.49367089 0.32051282 0.41025641]
Mean cross-validation score: 0.394255111976631
