# BC Dataton

In [12]:
import pandas as pd
import numpy as np

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns



# Pre-processing
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder

#modeling

from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline

from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_squared_log_error, r2_score

# Feature engineering and dimension reduction
from sklearn.feature_selection import VarianceThreshold


# Utils
from tqdm import tqdm
from pandas_profiling import ProfileReport
from pycaret.regression import *


%matplotlib inline

sns.set_style("white")
matplotlib.rc('xtick', labelsize=15)
matplotlib.rc('ytick', labelsize=15)
plt.rcParams['figure.figsize'] = [16.0, 10.0]

# Data Processing class

In [5]:
class DataFrameBuilder:
    """Skeleton of a step-by-step DataFrame preparation helper.

    Only the constructor has behavior so far; every processing step is a
    placeholder that does nothing and returns ``None``.
    """

    def __init__(self, dataframe):
        """Store a reference to the input as ``original_dataframe`` (no copy is made)."""
        self.original_dataframe = dataframe

    def handle_missing_data(self):
        """Placeholder step: not implemented yet, returns ``None``."""
        return None

    def process_columns(self):
        """Placeholder step: not implemented yet, returns ``None``."""
        return None

    def remove_columns(self):
        """Placeholder step: not implemented yet, returns ``None``."""
        return None

    def create_columns(self):
        """Placeholder step: not implemented yet, returns ``None``."""
        return None

    def build(self):
        """Placeholder step: not implemented yet, returns ``None``."""
        return None
        

In [2]:
# Load the 201902 training extract.
# `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0; `on_bad_lines="warn"`
# keeps the same behavior: rows whose field count does not match the header are dropped
# and reported instead of aborting the read.
# NOTE(review): the previews below show `\N` where values are missing — consider
# na_values=[r"\N"] here; confirm no downstream cell relies on the raw string first.
data = pd.read_csv("../data/train_201902.csv", on_bad_lines="warn")

b'Skipping line 240: expected 65 fields, saw 67\nSkipping line 478: expected 65 fields, saw 67\nSkipping line 943: expected 65 fields, saw 67\nSkipping line 1367: expected 65 fields, saw 67\nSkipping line 1672: expected 65 fields, saw 67\nSkipping line 1919: expected 65 fields, saw 66\nSkipping line 2044: expected 65 fields, saw 67\nSkipping line 2250: expected 65 fields, saw 67\nSkipping line 2623: expected 65 fields, saw 66\nSkipping line 2727: expected 65 fields, saw 66\nSkipping line 2918: expected 65 fields, saw 66\nSkipping line 3476: expected 65 fields, saw 66\nSkipping line 3494: expected 65 fields, saw 66\nSkipping line 3585: expected 65 fields, saw 67\nSkipping line 3682: expected 65 fields, saw 67\nSkipping line 4339: expected 65 fields, saw 67\nSkipping line 4641: expected 65 fields, saw 67\nSkipping line 5273: expected 65 fields, saw 67\nSkipping line 5853: expected 65 fields, saw 67\nSkipping line 6107: expected 65 fields, saw 66\nSkipping line 6426: expected 65 fields, s

b'Skipping line 49344: expected 65 fields, saw 67\nSkipping line 49359: expected 65 fields, saw 67\nSkipping line 49410: expected 65 fields, saw 67\nSkipping line 49635: expected 65 fields, saw 66\nSkipping line 49768: expected 65 fields, saw 66\nSkipping line 50272: expected 65 fields, saw 66\nSkipping line 50427: expected 65 fields, saw 66\nSkipping line 50767: expected 65 fields, saw 67\nSkipping line 51112: expected 65 fields, saw 66\nSkipping line 51283: expected 65 fields, saw 66\nSkipping line 51304: expected 65 fields, saw 66\nSkipping line 51424: expected 65 fields, saw 66\nSkipping line 51425: expected 65 fields, saw 66\nSkipping line 51578: expected 65 fields, saw 67\nSkipping line 51896: expected 65 fields, saw 67\nSkipping line 51973: expected 65 fields, saw 67\nSkipping line 52133: expected 65 fields, saw 66\nSkipping line 52146: expected 65 fields, saw 66\nSkipping line 52396: expected 65 fields, saw 67\nSkipping line 52898: expected 65 fields, saw 66\nSkipping line 5299

b'Skipping line 98815: expected 65 fields, saw 66\nSkipping line 98873: expected 65 fields, saw 67\nSkipping line 98975: expected 65 fields, saw 66\nSkipping line 99011: expected 65 fields, saw 67\nSkipping line 99099: expected 65 fields, saw 67\nSkipping line 99182: expected 65 fields, saw 66\nSkipping line 99319: expected 65 fields, saw 66\nSkipping line 99374: expected 65 fields, saw 66\nSkipping line 99836: expected 65 fields, saw 67\nSkipping line 99902: expected 65 fields, saw 66\nSkipping line 99956: expected 65 fields, saw 66\nSkipping line 99993: expected 65 fields, saw 67\nSkipping line 100282: expected 65 fields, saw 67\nSkipping line 100962: expected 65 fields, saw 67\nSkipping line 101521: expected 65 fields, saw 66\nSkipping line 101767: expected 65 fields, saw 66\nSkipping line 102074: expected 65 fields, saw 66\nSkipping line 102216: expected 65 fields, saw 67\nSkipping line 102528: expected 65 fields, saw 66\nSkipping line 102545: expected 65 fields, saw 67\nSkipping l

b'Skipping line 148274: expected 65 fields, saw 67\nSkipping line 148359: expected 65 fields, saw 66\nSkipping line 148418: expected 65 fields, saw 66\nSkipping line 148617: expected 65 fields, saw 66\nSkipping line 148620: expected 65 fields, saw 66\nSkipping line 148785: expected 65 fields, saw 66\nSkipping line 148990: expected 65 fields, saw 67\nSkipping line 149261: expected 65 fields, saw 67\nSkipping line 149625: expected 65 fields, saw 66\nSkipping line 149636: expected 65 fields, saw 66\nSkipping line 149809: expected 65 fields, saw 66\nSkipping line 149893: expected 65 fields, saw 66\nSkipping line 150059: expected 65 fields, saw 66\nSkipping line 150319: expected 65 fields, saw 66\nSkipping line 150433: expected 65 fields, saw 67\nSkipping line 150452: expected 65 fields, saw 67\nSkipping line 150576: expected 65 fields, saw 66\nSkipping line 150732: expected 65 fields, saw 66\nSkipping line 150975: expected 65 fields, saw 66\nSkipping line 151038: expected 65 fields, saw 66

b'Skipping line 197536: expected 65 fields, saw 66\nSkipping line 197583: expected 65 fields, saw 66\nSkipping line 197584: expected 65 fields, saw 66\nSkipping line 197601: expected 65 fields, saw 66\nSkipping line 197801: expected 65 fields, saw 66\nSkipping line 198167: expected 65 fields, saw 66\nSkipping line 198226: expected 65 fields, saw 66\nSkipping line 198336: expected 65 fields, saw 66\nSkipping line 198338: expected 65 fields, saw 67\nSkipping line 198353: expected 65 fields, saw 66\nSkipping line 198536: expected 65 fields, saw 66\nSkipping line 199280: expected 65 fields, saw 66\nSkipping line 199420: expected 65 fields, saw 66\nSkipping line 199834: expected 65 fields, saw 66\nSkipping line 199975: expected 65 fields, saw 66\nSkipping line 200051: expected 65 fields, saw 67\nSkipping line 200108: expected 65 fields, saw 67\nSkipping line 200256: expected 65 fields, saw 66\nSkipping line 200288: expected 65 fields, saw 67\nSkipping line 200297: expected 65 fields, saw 66

b'Skipping line 247009: expected 65 fields, saw 67\nSkipping line 247019: expected 65 fields, saw 66\nSkipping line 247058: expected 65 fields, saw 67\nSkipping line 247061: expected 65 fields, saw 66\nSkipping line 247384: expected 65 fields, saw 67\nSkipping line 247570: expected 65 fields, saw 67\nSkipping line 247681: expected 65 fields, saw 67\nSkipping line 247727: expected 65 fields, saw 67\nSkipping line 248058: expected 65 fields, saw 67\nSkipping line 248164: expected 65 fields, saw 67\nSkipping line 248363: expected 65 fields, saw 66\nSkipping line 248488: expected 65 fields, saw 67\nSkipping line 248634: expected 65 fields, saw 67\nSkipping line 249018: expected 65 fields, saw 66\nSkipping line 249411: expected 65 fields, saw 66\nSkipping line 249985: expected 65 fields, saw 66\nSkipping line 250936: expected 65 fields, saw 66\nSkipping line 251180: expected 65 fields, saw 67\nSkipping line 251283: expected 65 fields, saw 67\nSkipping line 251335: expected 65 fields, saw 66

b'Skipping line 296481: expected 65 fields, saw 66\nSkipping line 296554: expected 65 fields, saw 67\nSkipping line 296845: expected 65 fields, saw 66\nSkipping line 297565: expected 65 fields, saw 66\nSkipping line 298053: expected 65 fields, saw 66\nSkipping line 298305: expected 65 fields, saw 67\nSkipping line 299022: expected 65 fields, saw 67\nSkipping line 299041: expected 65 fields, saw 66\nSkipping line 299218: expected 65 fields, saw 67\nSkipping line 299749: expected 65 fields, saw 67\nSkipping line 299765: expected 65 fields, saw 67\nSkipping line 299930: expected 65 fields, saw 66\nSkipping line 300537: expected 65 fields, saw 66\nSkipping line 300588: expected 65 fields, saw 66\nSkipping line 300848: expected 65 fields, saw 66\nSkipping line 301044: expected 65 fields, saw 66\nSkipping line 301088: expected 65 fields, saw 67\nSkipping line 301182: expected 65 fields, saw 67\nSkipping line 301742: expected 65 fields, saw 67\nSkipping line 301974: expected 65 fields, saw 67

b'Skipping line 345636: expected 65 fields, saw 67\nSkipping line 345706: expected 65 fields, saw 66\nSkipping line 345794: expected 65 fields, saw 67\nSkipping line 346634: expected 65 fields, saw 66\nSkipping line 346639: expected 65 fields, saw 66\nSkipping line 347073: expected 65 fields, saw 67\nSkipping line 347094: expected 65 fields, saw 66\nSkipping line 347345: expected 65 fields, saw 67\nSkipping line 347710: expected 65 fields, saw 67\nSkipping line 347713: expected 65 fields, saw 67\nSkipping line 347784: expected 65 fields, saw 67\nSkipping line 348421: expected 65 fields, saw 66\nSkipping line 348563: expected 65 fields, saw 66\nSkipping line 348711: expected 65 fields, saw 67\nSkipping line 348725: expected 65 fields, saw 66\nSkipping line 349276: expected 65 fields, saw 67\nSkipping line 349426: expected 65 fields, saw 67\nSkipping line 349806: expected 65 fields, saw 67\nSkipping line 349891: expected 65 fields, saw 66\nSkipping line 350000: expected 65 fields, saw 67

b'Skipping line 386768: expected 65 fields, saw 66\nSkipping line 386927: expected 65 fields, saw 66\nSkipping line 387330: expected 65 fields, saw 67\nSkipping line 387614: expected 65 fields, saw 67\nSkipping line 387638: expected 65 fields, saw 67\nSkipping line 387783: expected 65 fields, saw 67\nSkipping line 387819: expected 65 fields, saw 67\nSkipping line 387853: expected 65 fields, saw 66\nSkipping line 387916: expected 65 fields, saw 66\nSkipping line 388009: expected 65 fields, saw 66\nSkipping line 388257: expected 65 fields, saw 66\nSkipping line 388730: expected 65 fields, saw 67\nSkipping line 388868: expected 65 fields, saw 66\nSkipping line 388894: expected 65 fields, saw 66\nSkipping line 389091: expected 65 fields, saw 66\nSkipping line 389311: expected 65 fields, saw 66\nSkipping line 389672: expected 65 fields, saw 67\nSkipping line 390809: expected 65 fields, saw 66\nSkipping line 390907: expected 65 fields, saw 66\nSkipping line 391121: expected 65 fields, saw 66

b'Skipping line 436134: expected 65 fields, saw 67\nSkipping line 436191: expected 65 fields, saw 67\nSkipping line 436447: expected 65 fields, saw 67\nSkipping line 436720: expected 65 fields, saw 67\nSkipping line 437064: expected 65 fields, saw 67\nSkipping line 437164: expected 65 fields, saw 67\nSkipping line 437231: expected 65 fields, saw 67\nSkipping line 437494: expected 65 fields, saw 67\nSkipping line 437545: expected 65 fields, saw 66\nSkipping line 437741: expected 65 fields, saw 66\nSkipping line 438410: expected 65 fields, saw 67\nSkipping line 438419: expected 65 fields, saw 67\nSkipping line 438714: expected 65 fields, saw 67\nSkipping line 438790: expected 65 fields, saw 66\nSkipping line 439128: expected 65 fields, saw 66\nSkipping line 439464: expected 65 fields, saw 67\nSkipping line 439807: expected 65 fields, saw 67\nSkipping line 439817: expected 65 fields, saw 66\nSkipping line 439986: expected 65 fields, saw 66\nSkipping line 440208: expected 65 fields, saw 66

b'Skipping line 485981: expected 65 fields, saw 66\nSkipping line 486112: expected 65 fields, saw 66\nSkipping line 486209: expected 65 fields, saw 66\nSkipping line 486283: expected 65 fields, saw 66\nSkipping line 486321: expected 65 fields, saw 67\nSkipping line 486347: expected 65 fields, saw 66\nSkipping line 486495: expected 65 fields, saw 67\nSkipping line 486580: expected 65 fields, saw 66\nSkipping line 486592: expected 65 fields, saw 66\nSkipping line 486763: expected 65 fields, saw 66\nSkipping line 487148: expected 65 fields, saw 67\nSkipping line 487411: expected 65 fields, saw 67\nSkipping line 487574: expected 65 fields, saw 67\nSkipping line 487600: expected 65 fields, saw 67\nSkipping line 488080: expected 65 fields, saw 67\nSkipping line 488774: expected 65 fields, saw 67\nSkipping line 489744: expected 65 fields, saw 67\nSkipping line 489770: expected 65 fields, saw 67\nSkipping line 489856: expected 65 fields, saw 67\nSkipping line 490098: expected 65 fields, saw 66

b'Skipping line 526587: expected 65 fields, saw 67\nSkipping line 527081: expected 65 fields, saw 66\nSkipping line 527426: expected 65 fields, saw 67\nSkipping line 527652: expected 65 fields, saw 66\nSkipping line 527670: expected 65 fields, saw 67\nSkipping line 527884: expected 65 fields, saw 67\nSkipping line 527941: expected 65 fields, saw 66\nSkipping line 527945: expected 65 fields, saw 66\nSkipping line 528181: expected 65 fields, saw 66\nSkipping line 528220: expected 65 fields, saw 66\nSkipping line 529331: expected 65 fields, saw 67\nSkipping line 529467: expected 65 fields, saw 67\nSkipping line 529650: expected 65 fields, saw 67\nSkipping line 529872: expected 65 fields, saw 67\nSkipping line 530068: expected 65 fields, saw 66\nSkipping line 530496: expected 65 fields, saw 66\nSkipping line 530687: expected 65 fields, saw 66\nSkipping line 531228: expected 65 fields, saw 66\nSkipping line 531279: expected 65 fields, saw 66\nSkipping line 531380: expected 65 fields, saw 66

b'Skipping line 568019: expected 65 fields, saw 67\nSkipping line 568067: expected 65 fields, saw 67\nSkipping line 568515: expected 65 fields, saw 66\nSkipping line 568795: expected 65 fields, saw 67\nSkipping line 569106: expected 65 fields, saw 66\nSkipping line 570338: expected 65 fields, saw 66\nSkipping line 570684: expected 65 fields, saw 66\nSkipping line 570827: expected 65 fields, saw 66\nSkipping line 571214: expected 65 fields, saw 67\nSkipping line 571370: expected 65 fields, saw 66\nSkipping line 571604: expected 65 fields, saw 67\nSkipping line 571892: expected 65 fields, saw 67\nSkipping line 571994: expected 65 fields, saw 67\nSkipping line 572022: expected 65 fields, saw 66\nSkipping line 572041: expected 65 fields, saw 67\nSkipping line 572124: expected 65 fields, saw 66\nSkipping line 572186: expected 65 fields, saw 66\nSkipping line 572717: expected 65 fields, saw 66\nSkipping line 572751: expected 65 fields, saw 66\nSkipping line 572943: expected 65 fields, saw 67

b'Skipping line 608840: expected 65 fields, saw 66\nSkipping line 608871: expected 65 fields, saw 66\nSkipping line 609223: expected 65 fields, saw 66\nSkipping line 609269: expected 65 fields, saw 67\nSkipping line 609386: expected 65 fields, saw 67\nSkipping line 609486: expected 65 fields, saw 66\nSkipping line 609922: expected 65 fields, saw 66\nSkipping line 610012: expected 65 fields, saw 66\nSkipping line 610133: expected 65 fields, saw 67\nSkipping line 610159: expected 65 fields, saw 67\nSkipping line 610543: expected 65 fields, saw 67\nSkipping line 610937: expected 65 fields, saw 66\nSkipping line 610963: expected 65 fields, saw 66\nSkipping line 610987: expected 65 fields, saw 67\nSkipping line 611289: expected 65 fields, saw 67\nSkipping line 611534: expected 65 fields, saw 67\nSkipping line 611672: expected 65 fields, saw 67\nSkipping line 611884: expected 65 fields, saw 67\nSkipping line 612398: expected 65 fields, saw 67\nSkipping line 612518: expected 65 fields, saw 67

b'Skipping line 649998: expected 65 fields, saw 67\nSkipping line 650202: expected 65 fields, saw 66\nSkipping line 650367: expected 65 fields, saw 66\nSkipping line 650640: expected 65 fields, saw 66\nSkipping line 650784: expected 65 fields, saw 67\nSkipping line 650847: expected 65 fields, saw 67\nSkipping line 650960: expected 65 fields, saw 67\nSkipping line 651336: expected 65 fields, saw 67\nSkipping line 651485: expected 65 fields, saw 66\nSkipping line 651591: expected 65 fields, saw 67\nSkipping line 651770: expected 65 fields, saw 67\nSkipping line 651981: expected 65 fields, saw 66\nSkipping line 652004: expected 65 fields, saw 66\nSkipping line 652216: expected 65 fields, saw 66\nSkipping line 652227: expected 65 fields, saw 67\nSkipping line 653140: expected 65 fields, saw 67\nSkipping line 653471: expected 65 fields, saw 67\nSkipping line 653547: expected 65 fields, saw 67\nSkipping line 653784: expected 65 fields, saw 66\nSkipping line 654128: expected 65 fields, saw 66

b'Skipping line 708138: expected 65 fields, saw 67\nSkipping line 708772: expected 65 fields, saw 66\nSkipping line 708939: expected 65 fields, saw 67\nSkipping line 709118: expected 65 fields, saw 67\nSkipping line 709345: expected 65 fields, saw 67\nSkipping line 709426: expected 65 fields, saw 67\nSkipping line 709496: expected 65 fields, saw 66\nSkipping line 709872: expected 65 fields, saw 66\nSkipping line 710158: expected 65 fields, saw 67\nSkipping line 710245: expected 65 fields, saw 66\nSkipping line 710520: expected 65 fields, saw 66\nSkipping line 710776: expected 65 fields, saw 66\nSkipping line 711096: expected 65 fields, saw 67\nSkipping line 711245: expected 65 fields, saw 67\nSkipping line 711340: expected 65 fields, saw 66\nSkipping line 711695: expected 65 fields, saw 67\nSkipping line 711805: expected 65 fields, saw 66\nSkipping line 711953: expected 65 fields, saw 66\nSkipping line 712278: expected 65 fields, saw 67\nSkipping line 712344: expected 65 fields, saw 67

b'Skipping line 756904: expected 65 fields, saw 67\nSkipping line 756945: expected 65 fields, saw 67\nSkipping line 757128: expected 65 fields, saw 67\nSkipping line 757267: expected 65 fields, saw 67\nSkipping line 757511: expected 65 fields, saw 66\nSkipping line 757894: expected 65 fields, saw 66\nSkipping line 758174: expected 65 fields, saw 66\nSkipping line 758531: expected 65 fields, saw 67\nSkipping line 758565: expected 65 fields, saw 67\nSkipping line 758682: expected 65 fields, saw 67\nSkipping line 758739: expected 65 fields, saw 67\nSkipping line 759165: expected 65 fields, saw 67\nSkipping line 759196: expected 65 fields, saw 67\nSkipping line 759284: expected 65 fields, saw 66\nSkipping line 759300: expected 65 fields, saw 66\nSkipping line 759532: expected 65 fields, saw 66\nSkipping line 759700: expected 65 fields, saw 66\nSkipping line 760306: expected 65 fields, saw 66\nSkipping line 760444: expected 65 fields, saw 66\nSkipping line 760556: expected 65 fields, saw 66

b'Skipping line 798124: expected 65 fields, saw 66\nSkipping line 798166: expected 65 fields, saw 67\nSkipping line 798481: expected 65 fields, saw 66\nSkipping line 798736: expected 65 fields, saw 67\nSkipping line 798787: expected 65 fields, saw 67\nSkipping line 799005: expected 65 fields, saw 67\nSkipping line 799042: expected 65 fields, saw 67\nSkipping line 799275: expected 65 fields, saw 66\nSkipping line 799417: expected 65 fields, saw 66\nSkipping line 800053: expected 65 fields, saw 67\nSkipping line 800085: expected 65 fields, saw 67\nSkipping line 800401: expected 65 fields, saw 66\nSkipping line 800419: expected 65 fields, saw 67\nSkipping line 800449: expected 65 fields, saw 67\nSkipping line 800518: expected 65 fields, saw 66\nSkipping line 800764: expected 65 fields, saw 67\nSkipping line 801096: expected 65 fields, saw 67\nSkipping line 801459: expected 65 fields, saw 67\nSkipping line 801891: expected 65 fields, saw 66\nSkipping line 802120: expected 65 fields, saw 67

b'Skipping line 847509: expected 65 fields, saw 67\nSkipping line 847677: expected 65 fields, saw 66\nSkipping line 847699: expected 65 fields, saw 66\nSkipping line 847976: expected 65 fields, saw 67\nSkipping line 848203: expected 65 fields, saw 66\nSkipping line 848519: expected 65 fields, saw 66\nSkipping line 848639: expected 65 fields, saw 66\nSkipping line 848709: expected 65 fields, saw 66\nSkipping line 848940: expected 65 fields, saw 67\nSkipping line 849200: expected 65 fields, saw 67\nSkipping line 849828: expected 65 fields, saw 67\nSkipping line 850065: expected 65 fields, saw 67\nSkipping line 850271: expected 65 fields, saw 66\nSkipping line 850419: expected 65 fields, saw 66\nSkipping line 850436: expected 65 fields, saw 66\nSkipping line 850572: expected 65 fields, saw 67\nSkipping line 850797: expected 65 fields, saw 67\nSkipping line 850917: expected 65 fields, saw 66\nSkipping line 850925: expected 65 fields, saw 66\nSkipping line 851616: expected 65 fields, saw 67

b'Skipping line 888679: expected 65 fields, saw 67\nSkipping line 888692: expected 65 fields, saw 66\nSkipping line 888824: expected 65 fields, saw 66\nSkipping line 889194: expected 65 fields, saw 67\nSkipping line 889346: expected 65 fields, saw 66\nSkipping line 889443: expected 65 fields, saw 66\nSkipping line 889450: expected 65 fields, saw 66\nSkipping line 889751: expected 65 fields, saw 67\nSkipping line 889834: expected 65 fields, saw 66\nSkipping line 890054: expected 65 fields, saw 67\nSkipping line 890228: expected 65 fields, saw 67\nSkipping line 890776: expected 65 fields, saw 66\nSkipping line 890993: expected 65 fields, saw 67\nSkipping line 891052: expected 65 fields, saw 66\nSkipping line 891181: expected 65 fields, saw 66\nSkipping line 891310: expected 65 fields, saw 67\nSkipping line 891385: expected 65 fields, saw 66\nSkipping line 891532: expected 65 fields, saw 66\nSkipping line 891604: expected 65 fields, saw 66\nSkipping line 891756: expected 65 fields, saw 66

b'Skipping line 929694: expected 65 fields, saw 66\nSkipping line 929970: expected 65 fields, saw 66\nSkipping line 930248: expected 65 fields, saw 67\nSkipping line 931504: expected 65 fields, saw 66\nSkipping line 931625: expected 65 fields, saw 66\nSkipping line 931816: expected 65 fields, saw 67\nSkipping line 931896: expected 65 fields, saw 67\nSkipping line 932006: expected 65 fields, saw 67\nSkipping line 932824: expected 65 fields, saw 67\nSkipping line 933209: expected 65 fields, saw 67\nSkipping line 933247: expected 65 fields, saw 66\nSkipping line 933357: expected 65 fields, saw 66\nSkipping line 933373: expected 65 fields, saw 66\nSkipping line 933449: expected 65 fields, saw 67\nSkipping line 933481: expected 65 fields, saw 66\nSkipping line 933499: expected 65 fields, saw 66\nSkipping line 933624: expected 65 fields, saw 67\nSkipping line 933682: expected 65 fields, saw 67\nSkipping line 933953: expected 65 fields, saw 67\nSkipping line 933978: expected 65 fields, saw 66

b'Skipping line 979097: expected 65 fields, saw 67\nSkipping line 979106: expected 65 fields, saw 66\nSkipping line 979235: expected 65 fields, saw 66\nSkipping line 979264: expected 65 fields, saw 67\nSkipping line 979509: expected 65 fields, saw 67\nSkipping line 979587: expected 65 fields, saw 66\nSkipping line 979792: expected 65 fields, saw 66\nSkipping line 980054: expected 65 fields, saw 67\nSkipping line 980163: expected 65 fields, saw 67\nSkipping line 980178: expected 65 fields, saw 67\nSkipping line 980216: expected 65 fields, saw 66\nSkipping line 980814: expected 65 fields, saw 66\nSkipping line 981258: expected 65 fields, saw 66\nSkipping line 981384: expected 65 fields, saw 67\nSkipping line 982113: expected 65 fields, saw 67\nSkipping line 982321: expected 65 fields, saw 66\nSkipping line 982469: expected 65 fields, saw 67\nSkipping line 982479: expected 65 fields, saw 66\nSkipping line 982766: expected 65 fields, saw 66\nSkipping line 982839: expected 65 fields, saw 67

b'Skipping line 1028800: expected 65 fields, saw 66\nSkipping line 1028802: expected 65 fields, saw 66\nSkipping line 1028878: expected 65 fields, saw 66\nSkipping line 1028930: expected 65 fields, saw 66\nSkipping line 1029070: expected 65 fields, saw 67\nSkipping line 1029269: expected 65 fields, saw 67\nSkipping line 1029383: expected 65 fields, saw 66\nSkipping line 1030444: expected 65 fields, saw 66\nSkipping line 1030546: expected 65 fields, saw 67\nSkipping line 1030649: expected 65 fields, saw 66\nSkipping line 1031009: expected 65 fields, saw 66\nSkipping line 1031169: expected 65 fields, saw 67\nSkipping line 1031298: expected 65 fields, saw 66\nSkipping line 1031476: expected 65 fields, saw 67\nSkipping line 1031507: expected 65 fields, saw 67\nSkipping line 1031635: expected 65 fields, saw 66\nSkipping line 1032169: expected 65 fields, saw 66\nSkipping line 1032239: expected 65 fields, saw 66\nSkipping line 1032263: expected 65 fields, saw 66\nSkipping line 1032289: expect

In [3]:
# Preview the first rows of the freshly loaded frame.
data.head()

Unnamed: 0,a,b,c,d,e,f,g,h,i,j,...,ddd,eee,fff,ggg,hhh,iii,jjj,kkk,lll,mmm
0,201902,2089776,19840630,34.54893908281998,M,DIVORCIADO,TECNOLOGO,TECNOLOGIA SISTEMAS,Empleado,ALQUILADA,...,0,1,0,311306.0,C,0,\N,1172612,\N,170490.0
1,201902,2088434,19880109,31.02258726899384,F,UNION LIBRE,UNIVERSITARIO,VETERINARIA,Comerciante,NO INFORMA,...,0,\N,\N,947070.0,SIN INFO,\N,\N,\N,\N,41041.0
2,201902,4780572,19940208,24.93908281998631,M,SOLTERO,NO INFORMA,OTROS,Empleado,\N,...,0,7,0,1114487.0775,G,0,\N,\N,\N,959126.0
3,201902,2088089,19860727,32.47638603696099,M,UNION LIBRE,NO INFORMA,\N,Independiente,FAMILIAR,...,0,\N,\N,1187025.0,SIN INFO,\N,\N,\N,\N,187990.0
4,201902,3892351,19910108,28.02464065708419,M,SOLTERO,TECNOLOGO,OTROS,Independiente,\N,...,0,\N,\N,4020204.37,SIN INFO,\N,70395,\N,\N,1323439.0


In [4]:
# Check the (rows, columns) dimensions of the loaded frame.
data.shape

(1042180, 65)

In [9]:
# Replace the frame's column labels with the names listed in header.txt.
# Only the header row is needed, so nrows=0 avoids parsing any data rows.
column_names = pd.read_csv("../data/header.txt", nrows=0).columns.to_list()
# Assignment raises ValueError if the number of names differs from the number of columns.
data.columns = column_names

In [10]:
# Preview again to confirm the column names from header.txt were applied.
data.head()

Unnamed: 0,periodo,id_cli,fecha_nacimiento,edad,genero,estado_civil,nivel_academico,profesion,ocupacion,tipo_vivienda,...,cuota_libranza_sf,cant_oblig_tot_sf,cant_cast_ult_12m_sr,ind,rep_calif_cred,pol_centr_ext,convenio_lib,ingreso_nomina,ingreso_segurida_social,gasto_familiar
0,201902,2089776,19840630,34.54893908281998,M,DIVORCIADO,TECNOLOGO,TECNOLOGIA SISTEMAS,Empleado,ALQUILADA,...,0,1,0,311306.0,C,0,\N,1172612,\N,170490.0
1,201902,2088434,19880109,31.02258726899384,F,UNION LIBRE,UNIVERSITARIO,VETERINARIA,Comerciante,NO INFORMA,...,0,\N,\N,947070.0,SIN INFO,\N,\N,\N,\N,41041.0
2,201902,4780572,19940208,24.93908281998631,M,SOLTERO,NO INFORMA,OTROS,Empleado,\N,...,0,7,0,1114487.0775,G,0,\N,\N,\N,959126.0
3,201902,2088089,19860727,32.47638603696099,M,UNION LIBRE,NO INFORMA,\N,Independiente,FAMILIAR,...,0,\N,\N,1187025.0,SIN INFO,\N,\N,\N,\N,187990.0
4,201902,3892351,19910108,28.02464065708419,M,SOLTERO,TECNOLOGO,OTROS,Independiente,\N,...,0,\N,\N,4020204.37,SIN INFO,\N,70395,\N,\N,1323439.0


In [11]:
# Summarize per-column dtypes and non-null counts.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1042180 entries, 0 to 1042179
Data columns (total 65 columns):
 #   Column                         Non-Null Count    Dtype  
---  ------                         --------------    -----  
 0   periodo                        1042180 non-null  int64  
 1   id_cli                         1042180 non-null  int64  
 2   fecha_nacimiento               1042180 non-null  int64  
 3   edad                           1042180 non-null  object 
 4   genero                         1042180 non-null  object 
 5   estado_civil                   1042180 non-null  object 
 6   nivel_academico                1042180 non-null  object 
 7   profesion                      1042180 non-null  object 
 8   ocupacion                      1042180 non-null  object 
 9   tipo_vivienda                  1042180 non-null  object 
 10  ult_actual                     1042180 non-null  int64  
 11  categoria                      1042180 non-null  object 
 12  codigo_ciiu   

In [11]:
# Load the 202002 training extract (pandas is already imported as `pd` in the first cell).
# low_memory=False makes pandas infer each column's dtype from the whole file instead of
# chunk by chunk, the documented remedy for the mixed-type DtypeWarning.
# NOTE(review): the truncated output of this cell looks like that warning — confirm.
# NOTE(review): this rebinds `data`, which earlier cells bound to the 201902 extract —
# confirm the overwrite is intended.
data = pd.read_csv("../data/train_202002.csv", low_memory=False)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [9]:
# Preview the first rows of the frame currently bound to `data`.
data.head()

Unnamed: 0,a,b,c,d,e,f,g,h,i,j,...,ddd,eee,fff,ggg,hhh,iii,jjj,kkk,lll,mmm
0,201902,2089776,19840630,34.54893908281998,M,DIVORCIADO,TECNOLOGO,TECNOLOGIA SISTEMAS,Empleado,ALQUILADA,...,0,1,0,311306.0,C,0,\N,1172612,\N,170490.0
1,201902,2088434,19880109,31.02258726899384,F,UNION LIBRE,UNIVERSITARIO,VETERINARIA,Comerciante,NO INFORMA,...,0,\N,\N,947070.0,SIN INFO,\N,\N,\N,\N,41041.0
2,201902,4780572,19940208,24.93908281998631,M,SOLTERO,NO INFORMA,OTROS,Empleado,\N,...,0,7,0,1114487.0775,G,0,\N,\N,\N,959126.0
3,201902,2088089,19860727,32.47638603696099,M,UNION LIBRE,NO INFORMA,\N,Independiente,FAMILIAR,...,0,\N,\N,1187025.0,SIN INFO,\N,\N,\N,\N,187990.0
4,201902,3892351,19910108,28.02464065708419,M,SOLTERO,TECNOLOGO,OTROS,Independiente,\N,...,0,\N,\N,4020204.37,SIN INFO,\N,70395,\N,\N,1323439.0


In [10]:
# Summarize per-column dtypes and non-null counts for the current `data`.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1046645 entries, 0 to 1046644
Data columns (total 65 columns):
 #   Column  Non-Null Count    Dtype  
---  ------  --------------    -----  
 0   a       1046645 non-null  int64  
 1   b       1046645 non-null  int64  
 2   c       1046645 non-null  int64  
 3   d       1046645 non-null  object 
 4   e       1046645 non-null  object 
 5   f       1046645 non-null  object 
 6   g       1046645 non-null  object 
 7   h       1046645 non-null  object 
 8   i       1046645 non-null  object 
 9   j       1046645 non-null  object 
 10  k       1046645 non-null  int64  
 11  l       1046645 non-null  object 
 12  m       1046645 non-null  int64  
 13  n       1046645 non-null  object 
 14  o       1046645 non-null  object 
 15  p       1046645 non-null  object 
 16  q       1046645 non-null  object 
 17  r       1046645 non-null  object 
 18  s       1046645 non-null  object 
 19  t       1046645 non-null  object 
 20  u       1046645 non-null