# Import packages

In [25]:
import pandas as pd
from sklearn.model_selection import cross_val_score, train_test_split
import zipfile

# Get the data

In [26]:
def _read_csv_member(archive, member_name):
    """Read one CSV member of an already-open zip archive into a DataFrame."""
    with archive.open(member_name) as member_file:
        return pd.read_csv(member_file)


# The three CSV files are read straight out of the zip archive; nothing is
# extracted to disk.
with zipfile.ZipFile("../data/playground-series-s4e7.zip") as archive:
    sample_submission = _read_csv_member(archive, "sample_submission.csv")
    test = _read_csv_member(archive, "test.csv")
    train = _read_csv_member(archive, "train.csv")

# Explore the data

In [27]:
# show_counts=True forces the Non-Null Count column to be printed, so missing
# values in train would be visible here.
train.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11504798 entries, 0 to 11504797
Data columns (total 12 columns):
 #   Column                Non-Null Count     Dtype  
---  ------                --------------     -----  
 0   id                    11504798 non-null  int64  
 1   Gender                11504798 non-null  object 
 2   Age                   11504798 non-null  int64  
 3   Driving_License       11504798 non-null  int64  
 4   Region_Code           11504798 non-null  float64
 5   Previously_Insured    11504798 non-null  int64  
 6   Vehicle_Age           11504798 non-null  object 
 7   Vehicle_Damage        11504798 non-null  object 
 8   Annual_Premium        11504798 non-null  float64
 9   Policy_Sales_Channel  11504798 non-null  float64
 10  Vintage               11504798 non-null  int64  
 11  Response              11504798 non-null  int64  
dtypes: float64(3), int64(6), object(3)
memory usage: 1.0+ GB


In [28]:
# Column names and dtypes of the test frame. Called without show_counts, and the
# printed summary carries no Non-Null Count column.
# NOTE(review): consider show_counts=True (as used for train) so missing values
# in test can be checked as well.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7669866 entries, 0 to 7669865
Data columns (total 11 columns):
 #   Column                Dtype  
---  ------                -----  
 0   id                    int64  
 1   Gender                object 
 2   Age                   int64  
 3   Driving_License       int64  
 4   Region_Code           float64
 5   Previously_Insured    int64  
 6   Vehicle_Age           object 
 7   Vehicle_Damage        object 
 8   Annual_Premium        float64
 9   Policy_Sales_Channel  float64
 10  Vintage               int64  
dtypes: float64(3), int64(5), object(3)
memory usage: 643.7+ MB


In [29]:
# Column names and dtypes of the sample submission frame (id and Response).
sample_submission.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7669866 entries, 0 to 7669865
Data columns (total 2 columns):
 #   Column    Dtype  
---  ------    -----  
 0   id        int64  
 1   Response  float64
dtypes: float64(1), int64(1)
memory usage: 117.0 MB


In [30]:
# Transpose the five-row preview so each column of train is shown as a row.
train.head().T

Unnamed: 0,0,1,2,3,4
id,0,1,2,3,4
Gender,Male,Male,Female,Female,Female
Age,21,43,25,35,36
Driving_License,1,1,1,1,1
Region_Code,35.0,28.0,14.0,1.0,15.0
Previously_Insured,0,0,1,0,1
Vehicle_Age,1-2 Year,> 2 Years,< 1 Year,1-2 Year,1-2 Year
Vehicle_Damage,Yes,Yes,No,Yes,No
Annual_Premium,65101.0,58911.0,38043.0,2630.0,31951.0
Policy_Sales_Channel,124.0,26.0,152.0,156.0,152.0


In [31]:
# include="all" summarises the object columns too (unique/top/freq) alongside
# the numeric statistics.
train.describe(include="all")

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
count,11504800.0,11504798,11504800.0,11504800.0,11504800.0,11504800.0,11504798,11504798,11504800.0,11504800.0,11504800.0,11504800.0
unique,,2,,,,,3,2,,,,
top,,Male,,,,,1-2 Year,Yes,,,,
freq,,6228134,,,,,5982678,5783229,,,,
mean,5752398.0,,38.38356,0.998022,26.41869,0.4629966,,,30461.37,112.4254,163.8977,0.1229973
std,3321149.0,,14.99346,0.0444312,12.99159,0.4986289,,,16454.75,54.03571,79.97953,0.3284341
min,0.0,,20.0,0.0,0.0,0.0,,,2630.0,1.0,10.0,0.0
25%,2876199.0,,24.0,1.0,15.0,0.0,,,25277.0,29.0,99.0,0.0
50%,5752398.0,,36.0,1.0,28.0,0.0,,,31824.0,151.0,166.0,0.0
75%,8628598.0,,49.0,1.0,35.0,1.0,,,39451.0,152.0,232.0,0.0


In [32]:
# Columns treated as categorical: the three object-dtype columns plus
# Driving_License and Previously_Insured.
categorical_variables = [
    "Gender",
    "Vehicle_Age",
    "Vehicle_Damage",
    "Driving_License",
    "Previously_Insured",
]

# Every remaining column is treated as numerical, except the target ("Response")
# and the row identifier ("id"). Column order follows train.columns.
_excluded_columns = set(categorical_variables) | {"Response", "id"}
numerical_variables = [
    name for name in train.columns if name not in _excluded_columns
]

In [33]:
# Show which columns ended up in the numerical list.
numerical_variables

['Age', 'Region_Code', 'Annual_Premium', 'Policy_Sales_Channel', 'Vintage']

# Split the data

In [34]:
# Set aside 20% of the rows of train as a held-out frame; the fixed
# random_state keeps the split repeatable across runs.
train_train, train_test = train_test_split(
    train,
    test_size=0.2,
    random_state=42,
)