In [1]:
import numpy as np # Fundamental package for linear algebra and multidimensional arrays
import pandas as pd # Data analysis and manipulation tool

# to ignore warnings
import warnings
warnings.filterwarnings("ignore")


In [2]:
# In read_csv() function, we have passed the location to where the files are located in the dphi official github page.
train_data = pd.read_csv("https://raw.githubusercontent.com/dphi-official/Datasets/master/Tinder_Millennial_Match/train_set_label.csv")

In [3]:
train_data.head()

Unnamed: 0,ID,Segment type,Segment Description,Answer,Count,Percentage,It became a relationship
0,292890.897,web,"Meridian, Idaho",No,0.0,0.0,0
1,292887.987,web,"Meridian, Idaho",No,0.0,0.0,0
2,292894.0656,gender,"Meridian, Idaho",No,499.173606,0.225255,0
3,292887.118,web,"Meridian, Idaho",No,0.0,0.0,0
4,292893.6561,gender,"Meridian, Idaho",No,455.925963,0.21136,0


In [4]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1896 entries, 0 to 1895
Data columns (total 7 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   ID                        1896 non-null   float64
 1   Segment type              1896 non-null   object 
 2   Segment Description       1896 non-null   object 
 3   Answer                    1896 non-null   object 
 4   Count                     1896 non-null   float64
 5   Percentage                1896 non-null   float64
 6   It became a relationship  1896 non-null   int64  
dtypes: float64(3), int64(1), object(3)
memory usage: 103.8+ KB


In [5]:
cat_train = train_data.select_dtypes('object')

In [6]:
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder(handle_unknown = 'ignore')
encoded_train = ohe.fit_transform(cat_train).toarray()
train = pd.DataFrame(encoded_train, columns=ohe.get_feature_names(cat_train.columns))

In [7]:
train_data = train_data.drop(cat_train.columns, axis = 1)
train_data = pd.concat([train_data, train], axis = 1)

In [8]:
# Input/independent variables
X = train_data.drop('It became a relationship', axis = 1) # here we are dropping the target feature as this is the target and 'X' is input features, the changes are not
# made inplace as we have not used 'inplace = True'

y = train_data['It became a relationship'] # Output/Dependent variable

In [9]:
# import train_test_split
from sklearn.model_selection import train_test_split

In [11]:
# split the data
X_train, X_val, y_train, y_val = train_test_split(X,y,test_size=0.3, random_state = 42)

In [12]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
model = XGBClassifier()
model.fit(X_train, y_train)
y_pred=model.predict(X_val)
print(accuracy_score(y_val, y_pred)*100)

88.04920913884007


In [13]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(random_state = 1)

In [14]:
rfc.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=1, verbose=0,
                       warm_start=False)

In [15]:
y_pred=model.predict(X_val)
print(accuracy_score(y_val, y_pred)*100)

88.04920913884007


In [16]:
# Importing RandomForestClassifier from sklearn.ensemble
# We will be further discussing about why Random Forest is in ensemble module of sklearn library
from sklearn.linear_model import LogisticRegression

In [17]:
lr = LogisticRegression()

In [18]:

lr.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [19]:
y_pred=lr.predict(X_val)
from sklearn.metrics import accuracy_score
from sklearn import metrics
print("Accuracy:",metrics.accuracy_score(y_val, y_pred)*100)

Accuracy: 58.69947275922671


In [20]:
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
clf = make_pipeline(StandardScaler(), SVC(gamma='auto'))
clf.fit(X_train, y_train)

y_pred=model.predict(X_val)
print(accuracy_score(y_val, y_pred)*100)

88.04920913884007


In [21]:
test_data = pd.read_csv('https://raw.githubusercontent.com/dphi-official/Datasets/master/Tinder_Millennial_Match/test_set_label.csv')

In [22]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 632 entries, 0 to 631
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   ID                   632 non-null    float64
 1   Segment type         632 non-null    object 
 2   Segment Description  632 non-null    object 
 3   Answer               632 non-null    object 
 4   Count                632 non-null    float64
 5   Percentage           632 non-null    float64
dtypes: float64(3), object(3)
memory usage: 29.8+ KB


In [23]:
cat_test = test_data.select_dtypes('object')

In [24]:
encoded_test = ohe.transform(cat_test).toarray()
test = pd.DataFrame(encoded_test, columns=ohe.get_feature_names(cat_test.columns))

In [25]:
test_data = test_data.drop(cat_test.columns, axis = 1)
test_data = pd.concat([test_data, test], axis = 1)

In [26]:
target = lr.predict(test_data)

In [27]:
# To create Dataframe of predicted value with particular respective index
res = pd.DataFrame(target) # target are nothing but the final predictions of your model on input features of your new unseen test data
res.columns = ["prediction"]

# To download the csv file locally
from google.colab import files
res.to_csv('submission.csv')
files.download('submission.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>