In [16]:
#Importing required modules.
import pandas as pd
from rootpath import detect

In [47]:
#Getting path of data file.
#NOTE(review): rootpath's detect() is documented to return None when it cannot find a
#project root. Fail early with a clear message instead of an opaque TypeError on the
#string concatenation below.
basePath = detect()
if basePath is None:
    raise FileNotFoundError("Could not detect the project root; cannot locate 'bank-full.csv'.")
dataPath = basePath + '/bank-full.csv'

In [50]:
#Load the whole semicolon-separated file into a single dataframe; the dataset is small,
#so reading it in one go should not cause memory problems.
dataDF = pd.read_csv(dataPath, sep=';')
#Preview the first five rows to check that the columns were parsed as expected.
dataDF.head(n=5)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


For reference:

## Bank client data:

   *  age (numeric)
   *  job : type of job (categorical:"admin.","unknown","unemployed","management","housemaid","entrepreneur","student","blue-collar","self-employed","retired","technician","services") 
   *  marital : marital status (categorical: "married","divorced","single"; note: "divorced" means divorced or widowed)
   *  education (categorical: "unknown","secondary","primary","tertiary")
   *  default: has credit in default? (binary: "yes","no")
   *  balance: average yearly balance, in euros (numeric) 
   *  housing: has housing loan? (binary: "yes","no")
   *  loan: has personal loan? (binary: "yes","no")

## Related to the last contact of the current campaign:

   *  contact: contact communication type (categorical: "unknown","telephone","cellular") 
   *  day: last contact day of the month (numeric)
   *  month: last contact month of year (categorical: "jan", "feb", "mar", ..., "nov", "dec")
   *  duration: last contact duration, in seconds (numeric)
   
## Other attributes:

   *  campaign: number of contacts performed during this campaign and for this client (numeric, includes last contact)
   *  pdays: number of days that passed by after the client was last contacted from a previous campaign (numeric, -1 means client was not previously contacted)
   *  previous: number of contacts performed before this campaign and for this client (numeric)
   *  poutcome: outcome of the previous marketing campaign (categorical: "unknown","other","failure","success")

## Output variable (desired target):

   *  y: has the client subscribed to a term deposit? (binary: "yes","no")


## Question 1: Which job is most likely to take out a loan? Of which type?

In [118]:
#Looking at main statistics for housing and personal loans, per job.
#The 'yes' masks are built from dataDF itself: the previous version indexed with a frame
#that is never defined in this notebook, so the cell raised a NameError on a fresh kernel.
hasHousingLoan = dataDF['housing'] == 'yes'
hasPersonalLoan = dataDF['loan'] == 'yes'

#One row per job: number of customers in the job and how many of them hold each loan type.
#Summing the boolean masks (instead of inner-joining pre-filtered frames) keeps a job in
#the table with a count of 0 when nobody in it holds a given loan type.
groupedByDF = pd.concat(
    [
        dataDF.groupby('job')['housing'].count().rename('total_per_job'),
        hasHousingLoan.groupby(dataDF['job']).sum().astype('int64').rename('housing'),
        hasPersonalLoan.groupby(dataDF['job']).sum().astype('int64').rename('loan'),
    ],
    axis = 1,
).rename_axis('job').reset_index()

#Share of each job's customers holding each loan type, in percent.
groupedByDF['housing_percentage'] = (groupedByDF['housing'] / groupedByDF['total_per_job'])*100
groupedByDF['loan_percentage'] = (groupedByDF['loan'] / groupedByDF['total_per_job'])*100
#total_number counts loans, not customers: a customer holding both loan types is counted
#twice, so total_percentage reads as "loans per 100 customers" and can exceed 100.
groupedByDF['total_number'] = groupedByDF['loan'] + groupedByDF['housing']
groupedByDF['total_percentage'] = (groupedByDF['total_number'] / groupedByDF['total_per_job'])*100

#Showing results ordered by percentage of loans
groupedByDF = groupedByDF.sort_values('total_percentage', ascending = False)

print(groupedByDF)

              job  total_per_job  housing  loan  housing_percentage  \
1     blue-collar           9732     7048  1684           72.420880   
7        services           4154     2766   836           66.586423   
2    entrepreneur           1487      869   356           58.439812   
0          admin.           5171     3182   991           61.535486   
9      technician           7597     4115  1309           54.166118   
6   self-employed           1579      765   229           48.448385   
4      management           9458     4678  1253           49.460774   
10     unemployed           1303      543   109           41.673062   
3       housemaid           1240      398   152           32.096774   
5         retired           2264      491   309           21.687279   
8         student            938      249    12           26.545842   
11        unknown            288       26     4            9.027778   

    loan_percentage  total_number  total_percentage  
1         17.303740   

In [119]:
#Re-rank the same per-job table by the absolute number of loans, largest first.
groupedByDF = groupedByDF.sort_values(by = 'total_number', ascending = False)

print(groupedByDF)

              job  total_per_job  housing  loan  housing_percentage  \
1     blue-collar           9732     7048  1684           72.420880   
4      management           9458     4678  1253           49.460774   
9      technician           7597     4115  1309           54.166118   
0          admin.           5171     3182   991           61.535486   
7        services           4154     2766   836           66.586423   
2    entrepreneur           1487      869   356           58.439812   
6   self-employed           1579      765   229           48.448385   
5         retired           2264      491   309           21.687279   
10     unemployed           1303      543   109           41.673062   
3       housemaid           1240      398   152           32.096774   
8         student            938      249    12           26.545842   
11        unknown            288       26     4            9.027778   

    loan_percentage  total_number  total_percentage  
1         17.303740   

Answer: Aggregating the data as shown above, blue-collar is the job with the most loans, both in absolute number and relative to the number of customers in that job. A blue-collar customer is therefore more likely to take out a loan than a customer with any other job. The loan type preferred by blue-collar customers is the housing loan, which about 72% of the blue-collar customers in this dataset hold.

## Observing the variables 'number of contacts for a given campaign' and 'campaign success', what are the main relevant points?