# __Kaggle Competition__

In [1]:
import pandas as pd
import numpy as np

# Logistic Regression
from sklearn.linear_model import LogisticRegression

# Decision Tree and Random Forest
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Gradient Boosting
from sklearn.ensemble import GradientBoostingClassifier
# or for faster training with large datasets
from sklearn.ensemble import HistGradientBoostingClassifier 

# Neural Networks
from sklearn.neural_network import MLPClassifier

### __Data Processing__

In [2]:
# Directory that holds every competition CSV. Change it here (and only here)
# if the dataset lives somewhere else.
DATA_DIR = './for_students'

# Raw tables. account.csv is the only file read with an explicit latin1
# encoding; the others use the pandas default.
acc_df = pd.read_csv(f'{DATA_DIR}/account.csv', encoding='latin1')
con1415_df = pd.read_csv(f'{DATA_DIR}/concerts_2014-15.csv')
con_df = pd.read_csv(f'{DATA_DIR}/concerts.csv')
sub_df = pd.read_csv(f'{DATA_DIR}/subscriptions.csv')
tix_df = pd.read_csv(f'{DATA_DIR}/tickets_all.csv')
zip_df = pd.read_csv(f'{DATA_DIR}/zipcodes.csv')

# Train / test account id lists.
train_df = pd.read_csv(f'{DATA_DIR}/train.csv')
test_df = pd.read_csv(f'{DATA_DIR}/test.csv')

### What the output should look like

In [3]:
# Load the sample submission file so its layout can be inspected.
sample_df = pd.read_csv('./for_students/sample_submission.csv')

# Column dtypes and overall shape.
print(sample_df.dtypes)
print(f"Rows x Columns: {sample_df.shape[0]} x {sample_df.shape[1]}")

# Count the rows that contain at least one missing value.
nan_count = sample_df.isna().any(axis=1).sum()
print(f"NaNs: {nan_count}")

# Preview of the first rows (displayed as the cell's result).
sample_df.head()

ID            object
Predicted    float64
dtype: object
Rows x Columns: 2975 x 2
NaNs: 0


Unnamed: 0,ID,Predicted
0,001i000000NuQ6Y,0.5
1,001i000000NuQXz,0.5
2,001i000000NuRDC,0.5
3,001i000000NuQeg,0.5
4,001i000000NuOQc,0.5


### concerts_2014-15.csv

In [4]:
# Column dtypes and overall shape of the 2014-15 concerts table.
print(con1415_df.dtypes)
print(f"Rows x Columns: {con1415_df.shape[0]} x {con1415_df.shape[1]}")

# Count the rows that contain at least one missing value.
nan_count = con1415_df.isna().any(axis=1).sum()
print(f"NaNs: {nan_count}")

# Display the whole frame as the cell's result.
con1415_df


season          object
concert.name    object
set              int64
who             object
what            object
dtype: object
Rows x Columns: 6 x 5
NaNs: 0


Unnamed: 0,season,concert.name,set,who,what
0,2014-2015,"Steven Isserlis, Boccherini, and Haydn",1,"Nicholas McGegan, conductor, Steven Isserlis, ...",BOCCHERINIConcerto for Violoncello No. 7 in G ...
1,2014-2015,"Andreas Scholl, J.S. Bach, and Handel",2,"Julian Wachner, conductor, Andreas Scholl, cou...",HANDELArias including ÒVa tacitoÓ from Giulio ...
2,2014-2015,Vivaldi and Zelenka: A Joyous Christmas,3,"Nicholas McGegan, conductor, Dominique Labelle...","VIVALDI Dixit Dominus, RV 807 ZELENKA Missa Na..."
3,2014-2015,The Cousins Bach,4,"Nicholas McGegan, conductor Sherezade Panthaki...","Johann Ludwig BACH, Trauermusik, Johann Christ..."
4,2014-2015,Rachel Podger and Vivaldi,5,"Rachel Podger, violin and leader",VIVALDI Violin concertos from LÕestro armonico...
5,2014-2015,RossiniÕs The Marriage Contract,6,"Nicholas McGegan, conductor Soloists from San ...",ROSSINI La cambiale di matrimonio (The Marriag...


### concerts.csv

In [5]:
# Summarise the concerts table: column dtypes, shape, and the number of rows
# that contain at least one missing value.
print(con_df.dtypes)
print(f"Rows x Columns: {con_df.shape[0]} x {con_df.shape[1]}")

nan_count = (con_df.isna().sum(axis=1) > 0).sum()
print(f"NaNs: {nan_count}")

# Drop every row with a missing value in any column. The cleaned frame is the
# last expression of the cell, so it is what the notebook displays.
# (A bare `con_df.head()` used to sit here; it was not the last expression, so
# its result was computed and silently discarded.)
con_df_clean = con_df.dropna()
con_df_clean

season          object
concert.name    object
set              int64
who             object
what            object
location        object
dtype: object
Rows x Columns: 103 x 6
NaNs: 3


Unnamed: 0,season,concert.name,set,who,what,location
0,2010-2011,Robert Levin Plays Mozart,1,"Nicholas McGegan, conductor\rRobert Levin, for...","MOZART: Incidental Music from Thamos, King of ...",Peninsula
1,2010-2011,Robert Levin Plays Mozart,1,"Nicholas McGegan, conductor\rRobert Levin, for...","MOZART: Incidental Music from Thamos, King of ...",Berkeley Sunday
2,2010-2011,Robert Levin Plays Mozart,1,"Nicholas McGegan, conductor\rRobert Levin, for...","MOZART: Incidental Music from Thamos, King of ...",San Francisco
3,2010-2011,Robert Levin Plays Mozart,1,"Nicholas McGegan, conductor\rRobert Levin, for...","MOZART: Incidental Music from Thamos, King of ...",Berkeley Saturday
4,2010-2011,Robert Levin Plays Mozart,1,"Nicholas McGegan, conductor\rRobert Levin, for...","MOZART: Incidental Music from Thamos, King of ...",Contra Costa
...,...,...,...,...,...,...
98,2013-2014,Music from the Heart of Europe,5,"Elizabeth Blumenstock, violin and leader\rPhil...",GEORG MUFFAT\rFasciculus I ÒNobilis JuventusÓ ...,Berkeley Saturday
99,2013-2014,Music from the Heart of Europe,5,"Elizabeth Blumenstock, violin and leader\rPhil...",GEORG MUFFAT\rFasciculus I ÒNobilis JuventusÓ ...,Berkeley Sunday
100,2013-2014,Vivaldi's Venice,6,"Nicholas McGegan, conductor\rCcile van de San...",VIVALDI\rJuditha triumphans devicta Holofernis...,Berkeley Sunday
101,2013-2014,Vivaldi's Venice,6,"Nicholas McGegan, conductor\rCcile van de San...",VIVALDI\rJuditha triumphans devicta Holofernis...,San Francisco


### zipcodes.csv

In [6]:
# Column dtypes of the raw zip code table.
print(zip_df.dtypes)

# Rows with at least one missing value before cleaning.
nan_count = zip_df.isna().any(axis=1).sum()
print(f"Old NaNs: {nan_count}")

# Build the cleaned frame in one chain:
#   1. keep only rows whose 'Decommisioned' flag is False,
#   2. drop the coordinate columns and the flag itself,
#   3. replace every remaining missing value with -1.0.
zip_df_clean = (
    zip_df
    .loc[zip_df['Decommisioned'].eq(False)]
    .drop(columns=['Lat', 'Long', 'Location', 'Decommisioned'])
    .fillna(-1.0)
)

# Rows with at least one missing value after cleaning.
nan_count = zip_df_clean.isna().any(axis=1).sum()
print(f"New NaNs: {nan_count}")
zip_df_clean

Zipcode                  int64
ZipCodeType             object
City                    object
State                   object
LocationType            object
Lat                    float64
Long                   float64
Location                object
Decommisioned             bool
TaxReturnsFiled        float64
EstimatedPopulation    float64
TotalWages             float64
dtype: object
Old NaNs: 13678
New NaNs: 0


Unnamed: 0,Zipcode,ZipCodeType,City,State,LocationType,TaxReturnsFiled,EstimatedPopulation,TotalWages
0,705,STANDARD,AIBONITO,PR,PRIMARY,-1.0,-1.0,-1.0
1,610,STANDARD,ANASCO,PR,PRIMARY,-1.0,-1.0,-1.0
2,611,PO BOX,ANGELES,PR,PRIMARY,-1.0,-1.0,-1.0
3,612,STANDARD,ARECIBO,PR,PRIMARY,-1.0,-1.0,-1.0
4,601,STANDARD,ADJUNTAS,PR,PRIMARY,-1.0,-1.0,-1.0
...,...,...,...,...,...,...,...,...
41897,56945,UNIQUE,PARCEL RETURN SERVICE,DC,PRIMARY,-1.0,-1.0,-1.0
41898,56950,UNIQUE,PARCEL RETURN SERVICE,DC,PRIMARY,-1.0,-1.0,-1.0
41899,56965,UNIQUE,PARCEL RETURN SERVICE,DC,PRIMARY,-1.0,-1.0,-1.0
41900,56972,UNIQUE,PARCEL RETURN SERVICE,DC,PRIMARY,-1.0,-1.0,-1.0


### account.csv

In [7]:
# Column dtypes of the raw account table.
print(acc_df.dtypes)

# Rows with at least one missing value before cleaning.
nan_count = (acc_df.isna().sum(axis=1) > 0).sum()
print(f"Old NaNs: {nan_count}")

# Work on a copy: a plain `acc_df_clean = acc_df` only binds a second name to
# the same object, so the column assignments below would also rewrite the raw
# `acc_df` frame.
acc_df_clean = acc_df.copy()

# Shipping and billing fields back-fill each other. `Series.fillna(other)`
# aligns on the index and only fills positions where this column is NaN and
# the other column has a value, which is the same rule the previous row-wise
# lambdas implemented, without a Python-level loop over rows.
# Fill shipping city with billing city where shipping city is missing.
acc_df_clean['shipping.city'] = acc_df_clean['shipping.city'].fillna(acc_df_clean['billing.city'])
# Fill billing city with shipping city where billing city is missing.
acc_df_clean['billing.city'] = acc_df_clean['billing.city'].fillna(acc_df_clean['shipping.city'])
# Fill shipping zip code with billing zip code where shipping zip code is missing.
acc_df_clean['shipping.zip.code'] = acc_df_clean['shipping.zip.code'].fillna(acc_df_clean['billing.zip.code'])
# Fill billing zip code with shipping zip code where billing zip code is missing.
acc_df_clean['billing.zip.code'] = acc_df_clean['billing.zip.code'].fillna(acc_df_clean['shipping.zip.code'])

# Drop the two columns that are not kept, then drop every row that still has
# a missing value in any remaining column.
acc_df_clean = acc_df_clean.drop(columns=['first.donated', 'relationship'])
acc_df_clean = acc_df_clean.dropna()

# Rows with at least one missing value after cleaning.
nan_count = (acc_df_clean.isna().sum(axis=1) > 0).sum()
print(f"New NaNs: {nan_count}")

acc_df_clean

account.id                  object
shipping.zip.code           object
billing.zip.code            object
shipping.city               object
billing.city                object
relationship                object
amount.donated.2013        float64
amount.donated.lifetime    float64
no.donations.lifetime        int64
first.donated               object
dtype: object
Old NaNs: 19832
New NaNs: 0


Unnamed: 0,account.id,shipping.zip.code,billing.zip.code,shipping.city,billing.city,amount.donated.2013,amount.donated.lifetime,no.donations.lifetime
0,001i000000LhSrQ,94102,94102,San Francisco,San Francisco,0.0,0.0,0
1,001i000000LhyPD,94303,94303,Palo Alto,Palo Alto,0.0,500.0,5
2,001i000000LhyPE,94597,94597,Walnut Creek,Walnut Creek,0.0,783.0,7
3,001i000000LhyPF,94610,94610,Oakland,Oakland,1500.0,28435.0,28
4,001i000000LhyPG,94024,94024,Los Altos,Los Altos,300.0,4969.0,16
...,...,...,...,...,...,...,...,...
19828,001i0000018BZTP,95014,95014,Cupertino,Cupertino,0.0,0.0,0
19829,001i0000018BZTt,94521,94521,Concord,Concord,0.0,0.0,0
19830,001i0000018BZUN,94552,94552,castro valley,castro valley,0.0,0.0,0
19831,001i0000018BZUr,94043,94043,Mountain view,Mountain view,0.0,0.0,0


### subscriptions.csv

In [8]:
# Column dtypes of the raw subscriptions table.
print(sub_df.dtypes)

# Rows with at least one missing value before cleaning.
nan_count = sub_df.isna().any(axis=1).sum()
print(f"Old NaNs: {nan_count}")

# Discard rows missing either 'location' or 'section', then replace every
# remaining missing value with -1.0.
sub_df_clean = (
    sub_df
    .dropna(subset=['location', 'section'])
    .fillna(-1.0)
)

# Rows with at least one missing value after cleaning.
nan_count = sub_df_clean.isna().any(axis=1).sum()
print(f"New NaNs: {nan_count}")

sub_df_clean

account.id            object
season                object
package               object
no.seats               int64
location              object
section               object
price.level          float64
subscription_tier    float64
multiple.subs         object
dtype: object
Old NaNs: 4544
New NaNs: 0


Unnamed: 0,account.id,season,package,no.seats,location,section,price.level,subscription_tier,multiple.subs
0,001i000000LhyR3,2009-2010,Quartet,2,San Francisco,Premium Orchestra,1.0,1.0,no
1,001i000000NuOeY,2000-2001,Full,2,San Francisco,Orchestra,2.0,2.0,no
2,001i000000NuNvb,2001-2002,Full,2,Berkeley Saturday,Balcony Front,3.0,2.0,no
3,001i000000NuOIz,1993-1994,Quartet,1,Contra Costa,Orchestra,2.0,0.5,no
4,001i000000NuNVE,1998-1999,Full,2,Berkeley Sunday,Balcony Rear,4.0,2.0,no
...,...,...,...,...,...,...,...,...,...
28622,001i000000NuOE8,1994-1995,Full,3,Santa Rosa,Balcony,4.0,3.0,no
28623,001i000000NuPnA,2006-2007,Full,2,Peninsula,Balcony Front,3.0,2.0,no
28624,001i000000Lhyc6,2009-2010,Full,4,San Francisco,Dress Circle,3.0,4.0,no
28625,001i000000NuOhT,1995-1996,Full,2,Santa Rosa,Balcony,4.0,2.0,no


### tickets_all.csv

In [9]:
# Column dtypes of the raw tickets table.
print(tix_df.dtypes)

# Rows with at least one missing value before cleaning.
nan_count = tix_df.isna().any(axis=1).sum()
print(f"NaNs: {nan_count}")

# Build the cleaned frame in one chain:
#   1. discard rows missing either 'location' or 'set',
#   2. label a missing 'marketing.source' as 'unknown',
#   3. replace every other remaining missing value with -1.0.
# NOTE(review): 'price.level' is reported as object dtype, so the -1.0 fill
# leaves that column holding a mix of value types -- consider normalising it
# (e.g. with pd.to_numeric) before modelling; confirm against the raw values.
tix_df_clean = (
    tix_df
    .dropna(subset=['location', 'set'])
    .fillna({'marketing.source': 'unknown'})
    .fillna(-1.0)
)

# Rows with at least one missing value after cleaning.
nan_count = tix_df_clean.isna().any(axis=1).sum()
print(f"NaNs: {nan_count}")

tix_df_clean

account.id           object
price.level          object
no.seats              int64
marketing.source     object
season               object
location             object
set                 float64
multiple.tickets     object
dtype: object
NaNs: 2226
NaNs: 0


Unnamed: 0,account.id,price.level,no.seats,marketing.source,season,location,set,multiple.tickets
0,001i000000NuOP0,4,2,unknown,2012-2013,San Francisco,4.0,no
1,001i000000NuOYU,1,1,Other,2013-2014,Berkeley Sunday,6.0,no
2,001i000000NuRS9,4,2,unknown,2011-2012,Family concert,3.0,no
3,001i000000NuRPj,3,1,Postcard or brochure,2013-2014,Berkeley Saturday,2.0,no
4,001i000000NuRQ1,2,2,unknown,2013-2014,Berkeley Saturday,1.0,no
...,...,...,...,...,...,...,...,...
2803,001i000000NuRUn,-1.0,4,unknown,2012-2013,Berkeley Saturday,5.0,yes
2804,001i000000NuOP0,4.0,2,unknown,2012-2013,San Francisco,4.0,no
2805,001i000000NuOYU,-1.0,1,Other,2013-2014,Berkeley Sunday,6.0,no
2806,001i000000NuRS9,4.0,2,unknown,2011-2012,Family concert,3.0,no


# __Still undecided on whether to merge the dataframes into one. Keeping them separate for now, since the plan is to use an ensemble.__

### __Modeling__

Here are some popular models for binary classification tasks, each with unique strengths depending on your data and goals:

1. **Logistic Regression**  
   - Simple yet powerful for linear relationships.
   - Interpretable, with coefficients indicating feature importance.
   
2. **Support Vector Machine (SVM)**  
   - Effective with clear class boundaries and works well with high-dimensional data.
   - Allows for both linear and non-linear classification with the kernel trick.

3. **Decision Trees**  
   - Highly interpretable, handling non-linear relationships.
   - Prone to overfitting, but this can be managed with pruning or using ensemble methods.

4. **Random Forest**  
   - An ensemble of decision trees that improves stability and reduces overfitting.
   - Great for handling complex data with many features.

5. **Gradient Boosting Models (e.g., XGBoost, LightGBM, CatBoost)**  
   - Often yield state-of-the-art results for structured/tabular data.
   - Effective at handling imbalanced data and capturing complex relationships.

6. **K-Nearest Neighbors (KNN)**  
   - Simple, instance-based approach for smaller datasets.
   - Works best when there is a clear clustering of data points by class.

7. **Neural Networks (especially shallow ones for binary classification)**  
   - Useful for complex patterns, though a simpler structure is often enough for binary tasks.
   - More suited for larger datasets due to training complexity.

8. **Naïve Bayes**  
   - Fast and effective for text classification and cases where independence assumptions roughly hold.
   - Performs well even with smaller datasets.

Each model has strengths for different data types and should be tested and validated to determine the best fit for your specific binary classification task.

In [10]:
# Column dtypes and overall shape of the training table.
print(train_df.dtypes)
print(f"Rows x Columns: {train_df.shape[0]} x {train_df.shape[1]}")

# Count the rows that contain at least one missing value.
nan_count = train_df.isna().any(axis=1).sum()
print(f"NaNs: {nan_count}")

# Preview of the first rows (displayed as the cell's result).
train_df.head()

account.id    object
label          int64
dtype: object
Rows x Columns: 6941 x 2
NaNs: 0


Unnamed: 0,account.id,label
0,001i000000NuRo3,0
1,001i000000NuRxd,0
2,001i000000NuQGN,0
3,001i000000NuPfL,0
4,001i000000NuQkP,0


In [11]:
# Column dtypes and overall shape of the test table.
print(test_df.dtypes)
print(f"Rows x Columns: {test_df.shape[0]} x {test_df.shape[1]}")

# Count the rows that contain at least one missing value.
nan_count = test_df.isna().any(axis=1).sum()
print(f"NaNs: {nan_count}")

# Preview of the first rows (displayed as the cell's result).
test_df.head()

ID    object
dtype: object
Rows x Columns: 2975 x 1
NaNs: 0


Unnamed: 0,ID
0,001i000000NuQ6Y
1,001i000000NuQXz
2,001i000000NuRDC
3,001i000000NuQeg
4,001i000000NuOQc


In [12]:
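# TODO: there are still lots of NaNs to deal with -- decide how to clean that data before modeling.

# TODO: implement the models in this order: logistic regression -> decision tree / random forest -> gradient boosting -> neural networks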
