# Kickstarter Data Analysis

In [344]:
# Loading packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import random

## Data Cleaning

### Initial Look at Features

In [345]:
# Importing data
kickstarter2018 = pd.read_csv('archive/ks-projects-201801.csv')

In [346]:
# First look at dataset
kickstarter2018.head()

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,10/9/2015,1000.0,8/11/2015 12:12,0.0,failed,0,GB,0.0,0.0,1533.95
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,11/1/2017,30000.0,9/2/2017 4:43,2421.0,failed,15,US,100.0,2421.0,30000.0
2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2/26/2013,45000.0,1/12/2013 0:20,220.0,failed,3,US,220.0,220.0,45000.0
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,4/16/2012,5000.0,3/17/2012 3:24,1.0,failed,1,US,1.0,1.0,5000.0
4,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,8/29/2015,19500.0,7/4/2015 8:35,1283.0,canceled,14,US,1283.0,1283.0,19500.0


In [347]:
# Summarizing the catagrical variables before cleaning
kickstarter2018.describe(include=['object'])

Unnamed: 0,name,category,main_category,currency,deadline,launched,state,country
count,378657,378661,378661,378661,378661,378661,378661,378661
unique,375722,159,15,14,3164,347035,6,23
top,#NAME?,Product Design,Film & Video,USD,8/8/2014,6/20/2017 16:00,failed,US
freq,43,22314,63585,295365,705,15,197719,292627


In [348]:
# Summarizing numerical variables before cleaning
kickstarter2018.describe()

Unnamed: 0,ID,goal,pledged,backers,usd pledged,usd_pledged_real,usd_goal_real
count,378661.0,378661.0,378661.0,378661.0,374864.0,378661.0,378661.0
mean,1074731000.0,49080.79,9682.979,105.617476,7036.729,9058.924,45454.4
std,619086200.0,1183391.0,95636.01,907.185035,78639.75,90973.34,1152950.0
min,5971.0,0.01,0.0,0.0,0.0,0.0,0.01
25%,538263500.0,2000.0,30.0,2.0,16.98,31.0,2000.0
50%,1075276000.0,5200.0,620.0,12.0,394.72,624.33,5500.0
75%,1610149000.0,16000.0,4076.0,56.0,3034.09,4050.0,15500.0
max,2147476000.0,100000000.0,20338990.0,219382.0,20338990.0,20338990.0,166361400.0


### Dealing with Null Values

Since we will be dropping the name and usd pledged variables, we do not need to deal with anu null values. 

In [349]:
# Counting missing values in each column
kickstarter2018.isnull().sum()

ID                     0
name                   4
category               0
main_category          0
currency               0
deadline               0
goal                   0
launched               0
pledged                0
state                  0
backers                0
country                0
usd pledged         3797
usd_pledged_real       0
usd_goal_real          0
dtype: int64

### Dealing With Date Strings

In [350]:
def convertDate(row):
    return(datetime.strptime(row, "%m/%d/%Y").date())

def convertDateTime(row):
    return(datetime.strptime(row, "%m/%d/%Y %H:%M").date())

def computeDuration(row):
    return (row['deadline'] - row['launched']).days

In [351]:
# Converting all date strings to datetime objects and computing the number of days it was live
kickstarter2018['deadline'] = kickstarter2018['deadline'].apply(convertDate)
kickstarter2018['launched'] = kickstarter2018['launched'].apply(convertDateTime)
kickstarter2018['duration'] = kickstarter2018[['deadline', 'launched']].apply(computeDuration, axis = 1)

In [352]:
# Removing unnecessary columns from dataset
kickstarter2018 = kickstarter2018[[
                                    'main_category',  
                                    'state', 
                                    'backers', 
                                    'country', 
                                    'usd_pledged_real', 
                                    'usd_goal_real',
                                    'duration'
                                 ]]

# Renaming columns for readability
kickstarter2018.rename(columns = {'usd_pledged_real' : 'pledged', 'usd_goal_real' : 'goal', 'main_category' : 'category'}, inplace = True)

### Changing State to Binary Variable

In [353]:
kickstarter2018['state'].unique()

array(['failed', 'canceled', 'successful', 'live', 'undefined',
       'suspended'], dtype=object)

In [354]:
# Converting suspended and canceled observations to failed
kickstarter2018['state'] = kickstarter2018['state'].replace({'suspended' : 'failed', 'canceled' : 'failed'}) 

In [355]:
# Dropping observations with live and undefined states
kickstarter2018 = kickstarter2018[(kickstarter2018['state'] != 'live') & (kickstarter2018['state'] != 'undefined')]

In [356]:
kickstarter2018['state'].unique()

array(['failed', 'successful'], dtype=object)

In [357]:
kickstarter2018.shape

(372300, 7)

### Removing Outliers

In [358]:
from scipy.stats import iqr

In [359]:
IQR = []
def removeOutliers(df, columnName):
    IQR = iqr(df[columnName])
    Q1 = np.quantile(df[columnName],0.25)
    Q3 = np.quantile(df[columnName],0.75)
    return(df[(df[columnName] > (Q1 - 1.5*IQR)) & (df[columnName] < (Q3 + 1.5*IQR))])

In [360]:
df = removeOutliers(kickstarter2018, 'backers')
df = removeOutliers(df, 'pledged')
df = removeOutliers(df, 'goal')
df = removeOutliers(df, 'duration')
kickstarter2018 = df

In [361]:
df.describe()

Unnamed: 0,backers,pledged,goal,duration
count,173932.0,173932.0,173932.0,173932.0
mean,18.338069,982.800392,6370.21588,30.702148
std,25.34473,1380.473852,7073.961279,3.117555
min,0.0,0.0,0.01,23.0
25%,1.0,15.0075,1500.0,30.0
50%,7.0,265.735,3500.0,30.0
75%,26.0,1490.6475,9000.0,30.0
max,139.0,5482.0,30720.08,42.0


### Summarizing Features after Cleaning

In [362]:
# Summarizing the catagrical variables after cleaning
kickstarter2018.describe(include=['object'])

Unnamed: 0,category,state,country
count,173932,173932,173932
unique,15,2,23
top,Film & Video,failed,US
freq,25788,129131,134280


In [363]:
# Summarizing numerical variables after cleaning
kickstarter2018.describe()

Unnamed: 0,backers,pledged,goal,duration
count,173932.0,173932.0,173932.0,173932.0
mean,18.338069,982.800392,6370.21588,30.702148
std,25.34473,1380.473852,7073.961279,3.117555
min,0.0,0.0,0.01,23.0
25%,1.0,15.0075,1500.0,30.0
50%,7.0,265.735,3500.0,30.0
75%,26.0,1490.6475,9000.0,30.0
max,139.0,5482.0,30720.08,42.0


In [364]:
# Resetting index after removing observations
kickstarter2018.reset_index(drop = True, inplace = True)

### Exploratory Analysis and Data Visualization

this is text about what im doing

In [264]:
kickstarter2018.columns

Index(['category', 'state', 'backers', 'country', 'pledged', 'goal',
       'duration'],
      dtype='object')

In [265]:
kickstarter2018['country'].unique()

array(['US', 'CA', 'AU', 'NO', 'GB', 'IT', 'IE', 'MX', 'SE', 'FR', 'DE',
       'NL', 'CH', 'AT', 'DK', 'BE', 'ES', 'NZ', 'HK', 'N,0"', 'SG', 'JP',
       'LU'], dtype=object)

In [267]:
percentOfGoal = kickstarter2018['pledged'] / kickstarter2018['goal']

In [None]:
fig, ax = plt.subplots(nrows = 1, ncols = 1, figsize = (20, 100))
#ax.set_xscale("log")

plot = sns.violinplot(
    ax = ax,
    data = kickstarter2018,
    x = percentOfGoal, 
    y = 'category',
    hue = 'state')

plot.set_title('Category vs. Percent of Goal Pledged')
plot.set_xlabel('Percent of Goal Pledged')

### Feature Scaling

Feature scaling is important for models that use Euclidian distances as well as for visualization and decreases training time. 

In [365]:
# Scaling the numerical data
from sklearn.preprocessing import MinMaxScaler

In [366]:
ss = MinMaxScaler()

# Subsetting numerical data
kickstarter2018_numerical = kickstarter2018[['backers', 'pledged', 'goal', 'duration']]

# Fitiing scaler to numerical data
scaler = ss.fit(kickstarter2018_numerical.values)

# Transforming numerical data with the scaler and placing back into dataset
kickstarter2018_numerical_scaled = scaler.transform(kickstarter2018_numerical.values)
kickstarter2018[['backers', 'pledged', 'goal', 'duration']] = kickstarter2018_numerical_scaled
kickstarter2018.describe()

Unnamed: 0,backers,pledged,goal,duration
count,173932.0,173932.0,173932.0,173932.0
mean,0.131929,0.179278,0.207363,0.405376
std,0.182336,0.251819,0.230272,0.164082
min,0.0,0.0,0.0,0.0
25%,0.007194,0.002738,0.048828,0.368421
50%,0.05036,0.048474,0.113932,0.368421
75%,0.18705,0.271917,0.292968,0.368421
max,1.0,1.0,1.0,1.0


### One Hot Encoding Categorical Data

In [367]:
from sklearn.preprocessing import OneHotEncoder

In [368]:
# Subsetting categorical data
kickstarter_categorical = kickstarter2018[['category', 'country']]

In [369]:
kickstarter2018['category'].unique()

array(['Music', 'Publishing', 'Crafts', 'Design', 'Comics', 'Food',
       'Fashion', 'Theater', 'Art', 'Film & Video', 'Games',
       'Photography', 'Technology', 'Journalism', 'Dance'], dtype=object)

In [370]:
kickstarter2018['country'].unique()

array(['US', 'CA', 'AU', 'NO', 'GB', 'IT', 'IE', 'MX', 'SE', 'FR', 'DE',
       'NL', 'CH', 'AT', 'DK', 'BE', 'ES', 'NZ', 'HK', 'N,0"', 'SG', 'JP',
       'LU'], dtype=object)

In [371]:
encoder = OneHotEncoder(sparse = False)
encoded = encoder.fit_transform(kickstarter_categorical)
encoded_df = pd.DataFrame(encoded)

In [372]:
kickstarter2018 = kickstarter2018.join(encoded_df)
kickstarter2018.drop(['category', 'country'], axis = 1, inplace = True)

In [373]:
kickstarter2018

Unnamed: 0,state,backers,pledged,goal,duration,0,1,2,3,4,...,28,29,30,31,32,33,34,35,36,37
0,failed,0.007194,0.000182,0.162760,0.368421,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,failed,0.000000,0.000000,0.078333,0.368421,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,failed,0.000000,0.000000,0.162760,0.368421,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,failed,0.079137,0.121124,0.081380,0.368421,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,failed,0.115108,0.072054,0.048828,0.368421,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
173927,failed,0.007194,0.004560,0.055338,0.368421,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
173928,failed,0.028777,0.028092,0.211588,0.368421,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
173929,failed,0.035971,0.028274,0.048828,0.210526,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
173930,failed,0.043165,0.036483,0.488280,0.421053,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


### Model Selection

In [374]:
# Creating x and y data frames
X = kickstarter2018.drop('state', axis = 1)
Y = kickstarter2018['state']

In [384]:
# Splitting training and test data
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, Y, train_size = 80000, test_size = 20000)

In [385]:
from sklearn import svm
svm_model = svm.SVC()

In [386]:
svm_model.fit(x_train, y_train)

SVC()

In [387]:
svm_model.score(x_test, y_test)

0.9756

NameError: name 'SVC' is not defined