In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import auc, precision_recall_curve, roc_curve
from sklearn.model_selection import StratifiedKFold, learning_curve
from sklearn.svm import SVC
from xgboost import XGBClassifier

In [2]:
# Load the training features and labels; the first CSV column is used as the
# row index of each frame.
X_train = pd.read_csv(
    '../data/X_train.csv',
    index_col=0,
)
y_train = pd.read_csv(
    '../data/y_train.csv',
    index_col=0,
)

In [3]:
# Preview the first five rows of the feature frame.
X_train.head(5)

Unnamed: 0,year,burnable_frac,temperature,precipitation,dry_days,x,y,month_sin,month_cos,country_Azərbaycan,...,land_cover_130,land_cover_150,land_cover_153,land_cover_160,land_cover_180,land_cover_190,land_cover_200,land_cover_201,land_cover_210,land_cover_220
0,7,-5.573993,-1.342652,-0.314196,1.013571,0.287088,-0.910904,-2.449294e-16,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,11,0.271694,-1.075527,-0.113051,-0.683109,-0.338279,-0.940919,1.0,6.123234000000001e-17,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,6,0.386167,-0.948995,2.003547,0.235239,-0.051925,0.010957,0.8660254,0.5,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,14,-0.292112,0.274154,0.680102,-0.612044,-0.271643,-0.128709,1.224647e-16,-1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,17,0.351268,0.105444,-0.412483,0.0,0.369707,0.152659,-1.0,-1.83697e-16,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
# Downcast 64-bit numeric columns of X_train to 32-bit types to reduce memory.
#
# A column is converted only when BOTH its minimum and maximum fit in the
# target type: astype() does not range-check, so out-of-range integers would
# silently wrap around and out-of-range floats would become +/-inf.
# Columns that are already 32-bit or narrower are left alone so that they are
# never widened.

# specify the columns to convert
columns_to_convert = X_train.columns

int32_info = np.iinfo(np.int32)
FLOAT32_LIMIT = 3.4e38      # conservative bound on the float32 magnitude
FLOAT32_MIN_STD = 1.2e-7    # heuristic: below this spread float32 may lose the signal

for column in columns_to_convert:
    values = X_train[column]

    # Skip anything that is not wider than 32 bits; converting it would not
    # save memory (and could increase it).
    if values.dtype.itemsize <= 4:
        continue

    if pd.api.types.is_integer_dtype(values):
        # Both ends of the range must be representable as int32.
        if int32_info.min <= values.min() and values.max() <= int32_info.max:
            X_train[column] = values.astype('int32')
    elif pd.api.types.is_float_dtype(values):
        # Comparisons with NaN are False, so an all-NaN column is not converted.
        fits_range = (-FLOAT32_LIMIT <= values.min()
                      and values.max() <= FLOAT32_LIMIT)
        if fits_range and values.std() >= FLOAT32_MIN_STD:
            X_train[column] = values.astype('float32')


: 

: 

In [None]:
# Cross-validated ROC and precision-recall comparison of several classifiers.
# The training CSVs are streamed in chunks; for every chunk each classifier is
# evaluated with stratified 5-fold cross-validation and one figure is drawn
# (ROC on the left, precision-recall on the right).

# Candidate models, keyed by the name shown in the plot titles.
classifiers = {
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(),
    'XGBoost': XGBClassifier(),
    'SVM': SVC(probability=True)
}

# Number of CSV rows read per chunk.
chunksize = 10 ** 6

# Stratified k-fold cross-validation splitter.
cv = StratifiedKFold(n_splits=5)


def plot_cv_curves(name, clf, X, y, cv):
    """Cross-validate one classifier and plot its ROC and precision-recall curves.

    Parameters
    ----------
    name : str
        Display name used in the subplot titles.
    clf : estimator
        Classifier exposing ``fit`` and ``predict_proba``; it is refitted on
        every fold.
    X : pandas.DataFrame
        Feature rows (indexed positionally with ``.iloc``).
    y : numpy.ndarray
        1-D array of labels aligned with the rows of ``X``.
    cv : cross-validation splitter
        Object whose ``split(X, y)`` yields (train_index, test_index) pairs.

    Returns
    -------
    tuple of float
        ``(mean_auc, std_auc)``: AUC of the averaged ROC curve and the
        standard deviation of the per-fold AUCs.
    """
    # Per-fold curves have different numbers of points, so they are
    # interpolated onto fixed grids before being averaged.
    fpr_grid = np.linspace(0, 1, 100)
    recall_grid = np.linspace(0, 1, 100)

    tprs = []
    aucs = []
    precisions = []

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 5))

    for i, (train_index, test_index) in enumerate(cv.split(X, y)):
        # Fold-local names: do not overwrite the notebook-level X_train/y_train.
        X_fold_train, X_fold_test = X.iloc[train_index], X.iloc[test_index]
        y_fold_train, y_fold_test = y[train_index], y[test_index]

        clf.fit(X_fold_train, y_fold_train)
        scores = clf.predict_proba(X_fold_test)[:, 1]

        # ROC curve and area under it for this fold.
        fpr, tpr, _ = roc_curve(y_fold_test, scores)
        fold_tpr = np.interp(fpr_grid, fpr, tpr)
        fold_tpr[0] = 0.0
        tprs.append(fold_tpr)
        roc_auc = auc(fpr, tpr)
        aucs.append(roc_auc)

        ax1.plot(fpr, tpr, lw=1, alpha=0.3,
                 label=f'ROC fold {i} (AUC = {roc_auc:.2f})')

        # Precision-recall curve for this fold.
        precision, recall, _ = precision_recall_curve(y_fold_test, scores)
        # recall is returned in decreasing order; np.interp needs increasing x.
        precisions.append(np.interp(recall_grid, recall[::-1], precision[::-1]))

        ax2.plot(recall, precision, lw=1, alpha=0.3,
                 label=f'Precision-Recall fold {i}')

    ax1.plot([0, 1], [0, 1], linestyle='--', lw=2,
             label='Chance', alpha=.8)

    mean_tpr = np.mean(tprs, axis=0)
    mean_tpr[-1] = 1.0
    mean_auc = auc(fpr_grid, mean_tpr)
    std_auc = np.std(aucs)

    ax1.plot(fpr_grid, mean_tpr,
             label=f'Mean ROC (AUC = {mean_auc:.2f} +/- {std_auc:.2f})',
             lw=2)

    ax1.set(xlim=[-0.05, 1.05], ylim=[-0.05, 1.05],
            xlabel='False positive rate', ylabel='True positive rate',
            title=f'{name} ROC')
    ax1.legend(loc='lower right')

    mean_precision = np.mean(precisions, axis=0)

    ax2.plot(recall_grid, mean_precision,
             label='Mean Precision-Recall',
             lw=2)

    ax2.set(xlim=[-0.05, 1.05], ylim=[-0.05, 1.05],
            xlabel='Recall', ylabel='Precision',
            title=f'{name} Precision-Recall')
    ax2.legend(loc='lower left')

    plt.show()
    # Release the figure; many figures are created across chunks and models.
    plt.close(fig)

    return mean_auc, std_auc


# Read and process the data in chunks. The files are read the same way as the
# initial load (same directory, first column as index) so that the saved index
# does not leak in as a feature or as a second label column.
for X, y in zip(pd.read_csv('../data/X_train.csv', index_col=0, chunksize=chunksize),
                pd.read_csv('../data/y_train.csv', index_col=0, chunksize=chunksize)):

    # Estimators and StratifiedKFold expect a 1-D label array.
    # Assumes y_train.csv holds a single label column - TODO confirm.
    y_labels = y.iloc[:, 0].to_numpy()

    # Evaluate each classifier using cross-validation.
    for name, clf in classifiers.items():
        plot_cv_curves(name, clf, X, y_labels, cv)

