In [1]:
%matplotlib notebook
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
matplotlib.style.use('ggplot')
import random
from sklearn import svm, ensemble
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, BaggingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, auc, precision_recall_curve

In [2]:
ATTRIBUTE_DICT = {"SeriousDlqin2yrs" : "Has Had Serious Delinquincy in Past 2 Years",\
 "NumberOfTimes90DaysLate": "Number of Times Person was 90+ Days Late", \
 "RevolvingUtilizationOfUnsecuredLines" : "Credit Card Usage", \
 "NumberOfTime30-59DaysPastDueNotWorse": "Number of Times 30-59 Days Past Due", \
 "DebtRatio": "Debt Ration", \
 "NumberOfDependents": "Number of Dependents", \
 "MonthlyIncome": "Monthly Income", \
 "NumberOfOpenCreditLinesAndLoans" : "Number of Open Credit Lines and Loans", \
 "NumberRealEstateLoansOrLines": "Number of Real Estate Loans or Lines", \
 "NumberOfTime60-89DaysPastDueNotWorse": "Number of Times 60-89 Days Past Due", \
 "age": "Age"}

In [3]:
def read_in (path, source_type, has_index=False):
    '''
    Read in data.
    
    Takes path to data file, string indicating type of data storage,
    and indicator of whether data includes an index (N/A for json)
    
    Returns pandas DataFrame
    '''
    if source_type.lower() == "csv":
        if has_index:
            index_col_number=0
        else:
            index_col_number=None
        return pd.read_csv(path, index_col=index_col_number)
    if source_type.lower() == "excel":
        if has_index:
            index_col_number=0
        else:
            index_col_number=None
        return pd.read_excel(path, index_col=index_col_number)
    if source_type.lower() == "json":
        return pd.read_json(path)
    if source_type.lower() == "stata":
        if has_index:
            index_col_number=0
        else:
            index_col_number=None
        return pd.read_csv(path, index=index_col_number)

In [4]:
def get_description(df, save_to_file_name):
    df1 = df.describe().T
    modes = df.mode().T
    modes.rename(columns={0 : "mode"}, inplace = True)
    medians = pd.DataFrame(df.median())
    medians.rename(columns={0 : "median"}, inplace = True)
    stats_final = df1.join(modes, how="left").join(medians, how="left")
    stats_final.to_csv(save_to_file_name)
    return

In [32]:
def histo_plot(df, x_label, y_label, title, bins, axis):
    plt.hist(df, bins)
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.title(title)
    plt.axis(axis)
    plt.grid(True)
    fig = plt.figure()
    fig.savefig("results/" + title + ".png")

In [33]:
df = read_in("cs-training.csv", "csv", has_index = True)
#get_description(df, "results/description.csv")
for attribute in df.columns:
    name = ATTRIBUTE_DICT[attribute]
    title = "Individuals by " + name
    column_data = df[attribute].dropna()
    val_counts = column_data.value_counts()
    axis = [column_data.min(), column_data.max(), 0, val_counts.iloc[0]*1.2]
    bins = column_data.nunique()
    histo_plot(column_data, name, "Number of Individuals", title, bins, axis)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [27]:
df = read_in("cs-mini.csv", "csv", has_index = True)

df["age"].range()

AttributeError: 'Series' object has no attribute 'range'

In [11]:
mu, sigma = 100, 15
x = mu + sigma*np.random.randn(10000)

# the histogram of the data
plt.hist(x, 100, normed=1)

# add a 'best fit' line
#y = mlab.normpdf( bins, mu, sigma)
#l = plt.plot(bins, y, 'r--', linewidth=1)

plt.xlabel('Smarts')
plt.ylabel('Probability')
plt.title(r'$\mathrm{Histogram\ of\ IQ:}\ \mu=100,\ \sigma=15$')
plt.axis([40, 160, 0, 0.03])
plt.grid(True)

plt.show()

<IPython.core.display.Javascript object>