## <font color='black'>Purpose: 
Wouldn't it be amazing if you could be certain your marriage will be successful?
Let us help you figure out if this is the right person for you!</font>

### <font color='gray'>What You Need to Do:
    Simply answer a few quick questions and we will predict if you're destined for marital bliss.

In [1]:
import pandas as pd
import numpy as np
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
fileName = 'marriage.csv'
marriage_raw = pd.read_csv(fileName, header =0)

In [3]:
marriage_raw.shape

(2020, 163)

In [4]:
marriage_raw['gen'].head()

0    2
1    1
2    2
3    3
4    2
Name: gen, dtype: int64

In [5]:
#select the columns we want
marriage1 = marriage_raw[['m2','state','usr1','spouse','qs1','marital','m4','lwp3','e3','q1','q15a','q15b','q15c','q15d','q15e','q15f','q15g','q15h','q15i','q16','e5','fam2','ownrent','relig','income','ideo','receduc','racethn','gen']]

In [6]:
marriage1.head()

Unnamed: 0,m2,state,usr1,spouse,qs1,marital,m4,lwp3,e3,q1,...,q16,e5,fam2,ownrent,relig,income,ideo,receduc,racethn,gen
0,,36,2,2.0,1.0,1,,,1,2,...,2,1.0,1,1,9,4,4,4,1,2
1,,24,2,1.0,,1,,1.0,1,2,...,2,3.0,1,1,1,7,3,3,1,1
2,,44,2,1.0,,1,,,9,2,...,2,9.0,1,1,1,4,4,2,1,2
3,,39,1,,,3,,,1,2,...,2,,1,1,1,5,3,4,1,3
4,2.0,12,2,,,2,,,2,2,...,2,,1,1,8,7,5,4,1,2


In [7]:
#rename columns so we can understand
marriage1 = marriage1.rename(columns={'m2':'everMarried', 'state':'state','usr1': 'community', 'spouse': 'whichSpouse', 'qs1': 'age', 'marital': 'marriageStatus', 'm4': 'everDivorced', 'lwp3': 'livedTogether', 'e3': 'employment', 'q1': 'lifeSatisfaction','q15a':'faithfulness','q15b':'adeqIncome', 'q15c':'shareRelig', 'q15d':'goodHouse', 'q15e':'agreePolitics', 'q15f':'goodSexLife', 'q15g':'shareChores', 'q15h':'children', 'q15i':'commonInterests', 'q16':'divorceViews', 'e5':'spouseEmployment', 'fam2':'parentMaritalStatus', 'ownrent':'ownrent', 'relig':'religion', 'income':'incomepastyr', 'ideo':'politicalViews', 'receduc':'educationLevel', 'raceeth':'racethn','gen':'generation'})

In [8]:
#move everDivorced to end since it is the target
features = list(marriage1.columns.values)
features.pop(features.index('marriageStatus'))
features.pop(features.index('everDivorced'))
marriage1 = marriage1[['marriageStatus']+features+['everDivorced']]

In [9]:
#the only rows that have 0 as a valid value are q15a-i
#therefore set all blank values in the remaining columns to 0s
marriage1 = marriage1.replace(' ', 0)

In [10]:
marriage1 = marriage1.apply(pd.to_numeric, errors='ignore')

In [11]:
#make sure we converted all columns to integers
marriage1.dtypes

marriageStatus         int64
everMarried            int64
state                  int64
community              int64
whichSpouse            int64
age                    int64
livedTogether          int64
employment             int64
lifeSatisfaction       int64
faithfulness           int64
adeqIncome             int64
shareRelig             int64
goodHouse              int64
agreePolitics          int64
goodSexLife            int64
shareChores            int64
children               int64
commonInterests        int64
divorceViews           int64
spouseEmployment       int64
parentMaritalStatus    int64
ownrent                int64
religion               int64
incomepastyr           int64
politicalViews         int64
educationLevel         int64
racethn                int64
generation             int64
everDivorced           int64
dtype: object

In [12]:
#check if they didnt respond to marriage status if they did respond to the other question
marriage1.loc[(marriage1['marriageStatus']==9) & (marriage1['everDivorced'].notnull())]

Unnamed: 0,marriageStatus,everMarried,state,community,whichSpouse,age,livedTogether,employment,lifeSatisfaction,faithfulness,...,spouseEmployment,parentMaritalStatus,ownrent,religion,incomepastyr,politicalViews,educationLevel,racethn,generation,everDivorced
28,9,0,36,1,0,0,0,2,1,1,...,0,5,2,2,10,2,2,3,9,0
224,9,0,6,1,0,0,0,3,3,1,...,0,1,1,2,3,1,3,1,3,0
250,9,0,6,1,0,0,0,1,9,2,...,0,1,2,8,10,9,4,1,3,0
773,9,0,6,2,0,0,0,3,1,1,...,0,1,3,1,6,1,2,3,2,0
803,9,0,36,1,0,0,0,1,2,1,...,0,1,2,8,10,9,2,3,1,0
918,9,0,42,3,0,0,0,1,1,1,...,0,2,2,1,4,3,3,9,3,0
997,9,0,22,3,0,1,0,1,2,1,...,0,1,1,1,2,1,2,2,3,0
1148,9,0,6,2,0,0,0,3,1,1,...,0,1,2,1,10,2,3,1,3,0
1293,9,0,26,1,0,0,0,3,2,1,...,0,1,1,8,10,9,3,1,3,0
1307,9,0,36,1,0,0,0,3,2,1,...,0,1,1,9,10,9,3,2,3,0


In [13]:
#remove all rows where MarriageStatus = 6 - never been married or 9 - don't know/refused
marriage1 = marriage1.drop(marriage1[(marriage1.marriageStatus == 6) | (marriage1.marriageStatus == 9)].index)

In [14]:
#for all rows where marriage status is 3 - divorced or 4 - separated enter 1 in everDivorced
marriage1.loc[marriage1['marriageStatus'] == 3, 'everDivorced'] = 1
marriage1.loc[marriage1['marriageStatus'] == 4, 'everDivorced'] = 1

In [15]:
#for all rows where everMarried = 1 and marriage status = 2 enter 1 in everDivorced
#we are assuming here that if you've been married and current status is living together then you've been divorced
marriage1.loc[(marriage1['everMarried']==1) & (marriage1['marriageStatus']==2),'everDivorced'] = 1

In [16]:
#for all rows where marriage status is 1 - married or 2 - living together or 5 - widowed enter 2 in everDivorced
marriage1.loc[((marriage1['marriageStatus'].isin([1,2,5])) & (marriage1['everDivorced']==0)),'everDivorced'] = 2

In [17]:
#check no blank values for everDivorced
marriage1.loc[(marriage1['everDivorced'] == 0)]

Unnamed: 0,marriageStatus,everMarried,state,community,whichSpouse,age,livedTogether,employment,lifeSatisfaction,faithfulness,...,spouseEmployment,parentMaritalStatus,ownrent,religion,incomepastyr,politicalViews,educationLevel,racethn,generation,everDivorced


In [18]:
marriage1.head()

Unnamed: 0,marriageStatus,everMarried,state,community,whichSpouse,age,livedTogether,employment,lifeSatisfaction,faithfulness,...,spouseEmployment,parentMaritalStatus,ownrent,religion,incomepastyr,politicalViews,educationLevel,racethn,generation,everDivorced
0,1,0,36,2,2,1,0,1,2,1,...,1,1,1,9,4,4,4,1,2,2
1,1,0,24,2,1,0,1,1,2,2,...,3,1,1,1,7,3,3,1,1,2
2,1,0,44,2,1,0,0,9,2,2,...,9,1,1,1,4,4,2,1,2,2
3,3,0,39,1,0,0,0,1,2,1,...,0,1,1,1,5,3,4,1,3,1
4,2,2,12,2,0,0,0,2,2,1,...,0,1,1,8,7,5,4,1,2,2


In [19]:
marriage1.shape

(1594, 29)

In [20]:
#the features are everything but everdivorced, maritial status and everMarried
features =  list(marriage1.columns[2:28])

In [21]:
features

['state',
 'community',
 'whichSpouse',
 'age',
 'livedTogether',
 'employment',
 'lifeSatisfaction',
 'faithfulness',
 'adeqIncome',
 'shareRelig',
 'goodHouse',
 'agreePolitics',
 'goodSexLife',
 'shareChores',
 'children',
 'commonInterests',
 'divorceViews',
 'spouseEmployment',
 'parentMaritalStatus',
 'ownrent',
 'religion',
 'incomepastyr',
 'politicalViews',
 'educationLevel',
 'racethn',
 'generation']

In [22]:
#create the decision tree classifier
y = marriage1['everDivorced']
X = marriage1[features]

In [23]:
y.mean()

1.64366373902133

In [53]:
#break the data up into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=1)

In [54]:
clf10TREE = RandomForestClassifier(n_estimators=12)

In [55]:
clf10TREE = clf10TREE.fit(X_train, y_train)

In [56]:
#model accuracy
clf10TREE.score(X_train, y_train)

0.99113737075332353

In [57]:
test_prediction = clf10TREE.predict(X_test)

In [58]:
#check accuracy testing
print (accuracy_score(y_test, test_prediction))

0.720833333333


In [59]:
states={1:'alabama',
2:'alaska' ,
4:'arizona',
5:'arkansas', 
6:'california',     
8:'colorado' ,
9:'connecticut',  
10:'delaware',
11:'district of columbia',
12:'florida',
13:'georgia',
15:'hawaii',
16:'idaho',
17:'illinois',
18:'indiana',
19:'iowa',
20:'kansas', 
21:'kentucky', 
22:'louisiana',
23:'maine',
24:'maryland',
25:'massachusetts',  
26:'michigan',
27:'minnesota',
28:'mississippi',   
29:'missouri',
30:'montana',
31:'nebraska', 
32:'nevada',
33:'new hampshire',  
34:'new jersey',     
35:'new mexico',    
36:'new york',
37:'north carolina', 
38:'north dakota',
39:'ohio',
40:'oklahoma', 
41:'oregon',
42:'pennsylvania', 
44:'rhode island',  
45:'south carolina',
46:'south dakota',   
47:'tennessee',
48:'texas',
49:'utah', 
50:'vermont',
51:'virginia', 
53:'washington',     
54:'west virginia',  
55:'wisconsin',
56:'wyoming'}

In [51]:
pstate = 0
instate = input('Please enter the state you live in\n').lower()
for key, value in states.items():
    if value == instate:
        pstate = key

Please enter the state you live in
vermont


50


input_predict = (clf10TREE.predict([[]]))