
# Getting started

Once you've chosen your scenario, download the data from the Iowa website in CSV format. Start by loading the data with pandas. You may need to parse the date columns appropriately.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import math

## Load the data into a DataFrame
# Read the reduced Iowa liquor sales CSV (relative path) into `df`.
df = pd.read_csv('../../../../../Datasets/Iowa_Liquor_Sales_reduced.csv')

# Preview the first rows.
df.head()

## Transform the dates if needed, e.g.
# df["Date"] = pd.to_datetime(df["Date"], format="%m-%d-%y")

In [None]:
# Show the dtype pandas assigned to each column of df.
df.dtypes

## Eradicate Dollar Signs

In [None]:
# Remove the "$" character from the three money columns so they can be cast to
# float later. "$" is a regex metacharacter (end-of-string anchor), so the
# pattern is passed with regex=False to force a literal match instead of
# relying on pandas' special case for one-character patterns, which was
# deprecated in pandas 1.2.
# NOTE: the `regex` keyword requires pandas >= 0.23.
for dollar_col in ('State Bottle Cost', 'State Bottle Retail', 'Sale (Dollars)'):
    df[dollar_col] = df[dollar_col].str.replace("$", "", regex=False)
df.head()

## Convert Necessary Columns to Floats

In [None]:
# Number of rows in the per-county summary frame.
len(df_by_county)

In [None]:
# Replace the index with a fresh 0..n-1 integer index, in place; the previous
# index is kept as a regular column.
pop_est.reset_index(inplace=True)

In [None]:
# Discard the 'index' column, then preview the result.
del pop_est['index']
pop_est.head()

In [None]:
# Make the 'County' column the row index (in place) so that later column
# arithmetic against other county-indexed frames aligns on county label.
pop_est.set_index('County', inplace=True)
pop_est.head()

## Population per Store based on Current Population

In [None]:
# '2015 Population' divided by '# of Stores', matched up by index label.
residents_2015 = pop_est['2015 Population']
store_counts = df_by_county['# of Stores']
df_by_county['Pop Store Ratio Current'] = residents_2015 / store_counts
df_by_county.head()

## Population per Store based on 2020 Pop Estimate

In [None]:
# '2020 Population' divided by '# of Stores', matched up by index label.
residents_2020 = pop_est['2020 Population']
store_counts = df_by_county['# of Stores']
df_by_county['Pop Store Ratio 2020'] = residents_2020 / store_counts
df_by_county.head()

In [None]:
# Place the per-county sales summary and the population estimates side by
# side; with axis=1 the rows are lined up by index label.
combined_df = pd.concat([df_by_county, pop_est], axis=1)
combined_df.head()

## Saturation Level - Population to Store Ratio

In [None]:
# List 'Pop Store Ratio Current' from the largest value to the smallest.
combined_df['Pop Store Ratio Current'].sort_values(ascending=False)

## Sorted by Profit per Store Grouped by County

In [None]:
# Cast each of the three money columns to float, then show the resulting dtypes.
for money_col in ('State Bottle Cost', 'State Bottle Retail', 'Sale (Dollars)'):
    df[money_col] = df[money_col].astype('float')
df.dtypes

## Initial Data Length

In [None]:
# Row count of df at this point in the notebook.
len(df)

## Null Values by Column

In [None]:
# Count the missing (null) entries in each column.
df.isnull().sum()

In [None]:
# Remove, in place, every row that has at least one missing value.
df.dropna(inplace=True)

## New Length of df after Dropping

In [None]:
# Number of rows recorded for each distinct 'County' value.
df['County'].value_counts()

## Created Column 'ml Sold'

In [None]:
# Per-row product of 'Bottle Volume (ml)' and 'Bottles Sold'.
bottle_volume = df['Bottle Volume (ml)']
bottle_count = df['Bottles Sold']
df['ml Sold'] = bottle_volume * bottle_count
df.head()

## Create a Profit Column ((retail - cost) * bottles sold)

In [None]:
# Pairwise scatter/histogram grid over the modelling target and candidate
# predictors.
pairplot_columns = [
    'Profit Per Store',
    'Pop Store Ratio Current',
    'Pop Store Ratio 2020',
    'Consumption Per Person',
    'Weeks Rec Unemployment Benefits',
    'Residential Value',
]
sns.pairplot(combined_df[pairplot_columns])

In [None]:
# Pairwise correlation between the columns of combined_df.
combined_df.corr()

In [None]:
# Show the dtype of each column in combined_df.
combined_df.dtypes

# Build your models

Using scikit-learn or statsmodels, build the necessary models for your scenario. Evaluate model fit.

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn import linear_model

# train_test_split, cross_val_score and GridSearchCV moved to
# sklearn.model_selection in scikit-learn 0.18, and the old cross_validation /
# grid_search modules were removed in 0.20. Prefer the new location and fall
# back to the old one so the notebook runs on either side of the change.
try:
    from sklearn.model_selection import train_test_split
    from sklearn.model_selection import GridSearchCV
    from sklearn.model_selection import cross_val_score
except ImportError:  # scikit-learn < 0.18
    from sklearn.cross_validation import train_test_split
    from sklearn.grid_search import GridSearchCV
    from sklearn.cross_validation import cross_val_score

## Model with all Values

In [None]:
# Two feature matrices that differ only in which population-to-store ratio
# they use: the current one for fitting, the 2020 one for later prediction.
shared_features = ['Consumption Per Person', 'Residential Value']
X = combined_df[['Pop Store Ratio Current'] + shared_features]
X_future = combined_df[['Pop Store Ratio 2020'] + shared_features]
y = combined_df['Profit Per Store']

# Fixed random_state keeps the train/test partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=99)

# Ordinary least squares fit on the training partition; fit() returns the
# estimator itself, so `model` and `reg` refer to the same object.
reg = LinearRegression()
model = reg.fit(X_train, y_train)
# Optional check, currently disabled:
# cross_val_score(model, X_train, y_train, cv=3).mean()

## Train Score All Values

In [None]:
# Score of the fitted model on the training partition.
model.score(X_train,y_train)

## Test Score All Values

In [None]:
# Score of the fitted model on the held-out test partition.
model.score(X_test,y_test)

## Profit Per Store Predictions by County

In [None]:
# Preview the first rows of combined_df.
combined_df.head()

## Unemployment Data

In [None]:
# Only these four columns are read from the unemployment CSV.
fields = ['Year', 'County Name', 'Benefits Paid', 'Weeks Compensated']
unemployment = pd.read_csv('../../../../../Datasets/Unemployment_Insurance_Recipients_and_UI_Benefit_Payments_by_County__Annual_.csv', usecols=fields)
unemployment.head()

In [None]:
# Replace the index with a fresh 0..n-1 integer index, in place; the previous
# index is kept as a regular column.
unemployment.reset_index(inplace=True)

In [None]:
# Discard the 'index' column.
del unemployment['index']

In [None]:
# Keep only the rows whose Year is 2015.
is_2015 = unemployment['Year'] == 2015
unemployment = unemployment[is_2015]
unemployment.head()

In [None]:
# Index the unemployment rows by 'County Name' (in place) so they can be
# aligned with other county-indexed data.
unemployment.set_index('County Name', inplace=True)

In [None]:
# 'Weeks Compensated' divided by '2015 Population', stored as a new column.
# NOTE(review): the division aligns the two Series on their index labels, so
# any label without an exact match on the other side yields NaN -- confirm the
# county names in `unemployment` are formatted like combined_df's index.
combined_df["Weeks Rec Unemployment Benefits"] = (unemployment['Weeks Compensated']) / (combined_df['2015 Population'])

In [None]:
# Boolean frame marking which cells of combined_df are missing.
combined_df.isnull()

In [None]:
# Overwrite every missing cell with 0, modifying combined_df in place.
combined_df.fillna(0, inplace=True)

## Outlier Adjusted DataFrame

In [None]:
# Copy of combined_df without the rows labelled 'Davis' and 'Fremont'.
outlier_counties = ['Davis', 'Fremont']
outlier_adjusted = combined_df.drop(outlier_counties, axis=0)

## Correlation Between Independent Variables

In [None]:
import seaborn as sns

In [None]:
# Drop, in place, the columns labelled 1, 2, 3, 7, 8 and 9.
# presumably these are spreadsheet columns not needed for the analysis;
# verify against pop_est.xls
pop_est.drop([1,2,3,7,8,9], axis=1, inplace=True)

In [None]:
# Drop, in place, the rows labelled 109 through 124 inclusive.
pop_est.drop(list(range(109, 125)), inplace=True)

In [None]:
# Show the first and last rows of pop_est. The call form of print is used so
# the cell runs on both Python 2 and Python 3 (the bare `print x` statement is
# a syntax error on Python 3).
print(pop_est.head())
print(pop_est.tail())

In [None]:
# Give the four remaining pop_est columns descriptive labels, then preview.
names = ['County', '2015 Population', '2020 Population', '2025 Population']
pop_est.columns = names
# print() call form so the cell runs on both Python 2 and Python 3.
print(pop_est.head())

In [None]:
# Number of rows currently in pop_est.
len(pop_est)

In [None]:
# List 'Profit Per Store' from the largest value to the smallest.
combined_df['Profit Per Store'].sort_values(ascending=False)

## Consumption per Person

In [None]:
# 'ml Sold' divided by '2015 Population', row by row.
volume_sold = combined_df['ml Sold']
residents_2015 = combined_df["2015 Population"]
combined_df['Consumption Per Person'] = volume_sold / residents_2015

# Housing Data

In [None]:
# Only these three columns are read from the assessed-property-values CSV.
fields = ['County Name', 'Residential', 'Commercial']
housing = pd.read_csv('../../../../../Datasets/Assessed_Property_Values_By_Tax_District_and_Year.csv', usecols=fields)
housing.head()

In [None]:
# Remove the "$" character from both value columns and cast them to float.
# "$" is a regex metacharacter (end-of-string anchor), so regex=False forces a
# literal match instead of relying on pandas' special case for one-character
# patterns, which was deprecated in pandas 1.2.
# NOTE: the `regex` keyword requires pandas >= 0.23.
for value_col in ('Residential', 'Commercial'):
    housing[value_col] = housing[value_col].str.replace("$", "", regex=False).astype(float)

In [None]:
# Convert the county names to title case.
# presumably this is to match the capitalisation of the county labels used
# elsewhere in the notebook -- verify against combined_df's index
housing['County Name'] = housing['County Name'].str.title()

In [None]:
# Collapse to one row per county by averaging the value columns; the result
# is indexed by 'County Name'.
housing = housing.groupby('County Name').mean()

In [None]:
# Preview the first rows of housing.
housing.head()

In [None]:
# Copy both housing value columns onto combined_df under new names; the
# assignment lines values up by index label.
combined_df['Residential Value'] = housing['Residential']
combined_df['Commercial Value'] = housing['Commercial']

In [None]:
# Print the 'Residential' values scaled down by 10,000, largest first.
# The whole expression is wrapped in the print call: written as
# `print(a / 10000).sort_values(...)`, Python 3 calls .sort_values on print's
# None return value and raises AttributeError. This form prints the same
# sorted Series on both Python 2 and Python 3.
print((housing.Residential / 10000).sort_values(ascending=False))

In [None]:
# Per-row profit: (retail price - cost) for one bottle, times 'Bottles Sold'.
unit_margin = df['State Bottle Retail'] - df['State Bottle Cost']
df['Profit'] = unit_margin * df["Bottles Sold"]
df.head()

In [None]:
# Summary statistics for the columns of df.
df.describe()

# Explore the data

Perform some exploratory statistical analysis and make some plots, such as histograms of transaction totals, bottles sold, etc.

## Reformat the Dates

In [None]:
# Parse the 'Date' column into datetime values; no explicit format is given,
# so pandas infers it.
df['Date'] = pd.to_datetime(df['Date'])

## Sorting df by Date

In [None]:
# Predictions of the fitted model for the feature matrix X.
y_pred_2015 = model.predict(X)

In [None]:
# Predict using the 2020 population-to-store ratio in place of the current
# one. The model was fitted on a column named 'Pop Store Ratio Current', and
# recent scikit-learn releases reject a DataFrame whose column names differ
# from those seen during fit. The 2020 ratio is therefore relabelled to the
# fitted name for this call only: values and column order are unchanged and
# X_future itself is not modified.
y_pred_2020 = model.predict(
    X_future.rename(columns={'Pop Store Ratio 2020': 'Pop Store Ratio Current'})
)

In [None]:
# Attach both sets of predictions to combined_df as new columns.
combined_df['y_pred_2015'] = y_pred_2015
combined_df['y_pred_2020'] = y_pred_2020

## Model Removing Outliers

In [None]:
# Earliest rows by 'Date' (sorting returns a copy; df itself is not reordered).
df.sort_values('Date').head()

In [None]:
# Latest rows by 'Date' (sorting returns a copy; df itself is not reordered).
df.sort_values('Date').tail()

# Record your findings

Be sure to write out any observations from your exploratory analysis.

Multiple columns contained dollar signs, which had to be removed before the columns could be converted to floats.

The Date column was not stored in a proper datetime format, so I used `to_datetime` to convert it.

There are missing values for the following columns: County, County Number, Category, Category Name.

The data starts in January 2015 and ends in March 2016.

# Mine the data

Now you are ready to compute the variables you will use for your regression from the data. For example, you may want to compute total sales per store from Jan to March of 2015, mean price per bottle, etc. Refer to the readme for more ideas appropriate to your scenario.

Pandas is your friend for this task. Take a look at the operations here for ideas on how to make the best use of pandas and feel free to search for blog and Stack Overflow posts to help you group data by certain variables and compute sums, means, etc. You may find it useful to create a new data frame to house this summary data.

 


## Sum Columns by County

In [None]:
# Roll the transaction rows up to one row per county by summing columns.
# NOTE(review): how sum() treats non-numeric columns (dropped, concatenated,
# or rejected) depends on the pandas version -- confirm before upgrading.
df_by_county = df.groupby(['County']).sum()
df_by_county.head()

## drop irrelevant columns: store number

## Number of Stores in each County

In [None]:
# Count the distinct 'Store Number' values seen within each county.
store_numbers_by_county = df.groupby(['County'])['Store Number']
df_by_county['# of Stores'] = store_numbers_by_county.nunique()

#### concat a column of number of stores in a county by using store number unique

#### groupby county -- store number unique.sum()

## Profit Per Store

In [None]:
# County 'Profit' total divided by the county's '# of Stores'.
county_profit = df_by_county['Profit']
store_counts = df_by_county['# of Stores']
df_by_county['Profit Per Store'] = county_profit / store_counts
df_by_county.head()

# Refine the data

Look for any statistical relationships, correlations, or other relevant properties of the dataset.

## Read in Population Estimates

In [None]:
# Read the population-estimate spreadsheet without treating any row as a
# header, so the columns receive integer labels.
pop_est = pd.read_excel('pop_est.xls', header=None)
# Discard, in place, the rows labelled 0 through 9.
pop_est.drop(list(range(10)), inplace=True)
pop_est.head(15)