In [2]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import warnings
warnings.simplefilter('ignore', UserWarning)

In [1232]:
# Read the projects dataset from ks-projects-201801.csv (378,661 rows).
# The path is relative, so the CSV has to be in the notebook's working directory.
dataframe = pd.read_csv('ks-projects-201801.csv')

In [1233]:
# Show the column names, non-null counts, dtypes and memory usage of the data frame.
dataframe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 378661 entries, 0 to 378660
Data columns (total 15 columns):
ID                  378661 non-null int64
name                378657 non-null object
category            378661 non-null object
main_category       378661 non-null object
currency            378661 non-null object
deadline            378661 non-null object
goal                378661 non-null float64
launched            378661 non-null object
pledged             378661 non-null float64
state               378661 non-null object
backers             378661 non-null int64
country             378661 non-null object
usd pledged         374864 non-null float64
usd_pledged_real    378661 non-null float64
usd_goal_real       378661 non-null float64
dtypes: float64(5), int64(2), object(8)
memory usage: 43.3+ MB


In [None]:
# Show the first 5 rows of the data frame.
dataframe.head()

In [None]:
# Percentage of projects in each state, relative to all rows in the data frame.
state_counts = dataframe['state'].value_counts()
state_counts.div(len(dataframe)).mul(100)

In [None]:
# Plot the state distribution (number of projects per state).
# `x` is passed by keyword: in current seaborn releases `data` is the only positional
# parameter of catplot, so a positional 'state' combined with data=... raises a TypeError.
sns.catplot(x='state', data=dataframe, kind='count', height=6)

In [None]:
# Number of launched projects per day of the week.
# day_name() yields English weekday names independent of the system locale, and the
# reindex puts the bars in calendar order instead of the alphabetical order that
# groupby produces.
weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
launch_weekday = pd.to_datetime(dataframe['launched']).dt.day_name()
(dataframe.groupby(launch_weekday).size()
    .reindex(weekday_order, fill_value=0)
    .plot(kind='bar', figsize=(10, 5)))

In [None]:
# Total number of projects launched in each calendar year.
launch_year = pd.to_datetime(dataframe['launched']).dt.year
dataframe.groupby(launch_year).size()

In [None]:
# Line plot of the number of projects launched per calendar year.
launch_year = pd.to_datetime(dataframe['launched']).dt.year
projects_per_year = dataframe.groupby(launch_year).size()
projects_per_year.plot(figsize=(10, 5))

In [None]:
# Reduce the data points to get cleaner data: keep only projects launched on or
# after start_date, then re-plot the number of projects per year.
# NOTE(review): `launched` is still a string column at this point, so this is a
# lexicographic comparison; it presumably relies on a 'YYYY-MM-DD ...' format - confirm.
start_date = '2013-01-01'
launched_since_start = dataframe['launched'] >= start_date
dataframe = dataframe[launched_since_start]
launch_year = pd.to_datetime(dataframe['launched']).dt.year
dataframe.groupby(launch_year).size().plot(figsize=(10, 5))

In [None]:
# Average pledged amount in USD, rounded to the nearest integer.
mean_pledged_usd = dataframe['usd_pledged_real'].mean()
round(mean_pledged_usd)

In [None]:
# Average number of backers per project, truncated to an integer.
mean_backers = dataframe['backers'].mean()
int(mean_backers)

In [None]:
# Average project goal in USD, rounded to the nearest integer.
mean_goal_usd = dataframe['usd_goal_real'].mean()
round(mean_goal_usd)

In [None]:
# Heat map of the mean number of backers for every (main_category, country) pair:
# rows are main categories, columns are countries.
pivot_table = pd.pivot_table(
    dataframe,
    values='backers',
    index='main_category',
    columns='country',
    aggfunc='mean',
)
sns.heatmap(pivot_table)

In [None]:
# For the prediction task only two outcomes are of interest, so keep just the
# projects whose state is 'successful' or 'failed'.
finished_states = ['successful', 'failed']
is_finished = dataframe['state'].isin(finished_states)
dataframe = dataframe.loc[is_finished]

In [None]:
# Percentage of projects in each main category, relative to all remaining rows.
main_category_counts = dataframe['main_category'].value_counts()
main_category_counts.div(len(dataframe)).mul(100)

In [None]:
# Pie chart of each main category's share of the projects.
# The former y='mass' argument was removed: it names a column that does not exist in
# this data and is ignored when the plotted object is a Series.
main_category_share = dataframe['main_category'].value_counts() / len(dataframe)
main_category_share.plot.pie(figsize=(5, 5))

In [None]:
# Count the projects per country (most frequent country first).
dataframe['country'].value_counts()

In [None]:
# Plot the country distribution, with the most frequent country first.
# `x` is passed by keyword: in current seaborn releases `data` is the only positional
# parameter of catplot, so a positional 'country' combined with data=... raises a TypeError.
country_order = dataframe['country'].value_counts().index
sns.catplot(x='country', data=dataframe, order=country_order, kind='count', height=6)

In [None]:
# Project counts per country split by state, drawn as horizontal bars with the
# most frequent country first.
country_order = dataframe['country'].value_counts().index
sns.catplot(
    data=dataframe,
    y="country",
    hue="state",
    kind="count",
    order=country_order,
    edgecolor=".6",
);

In [None]:
# Count the missing values in every column to see which columns contain nulls.
dataframe.isna().sum()

In [None]:
# Remove the columns that should not be used as model features:
#  - 'usd pledged' contains null values (the info() output on the raw data shows a
#    few nulls in 'name' as well);
#  - 'pledged', 'backers' and 'usd_pledged_real' are only known after a project has
#    been launched;
#  - 'ID' and 'name' do not influence the machine learning process.
columns_to_drop = ['ID', 'name', 'usd pledged', 'pledged', 'backers', 'usd_pledged_real']
dataframe = dataframe.drop(columns=columns_to_drop)

In [None]:
# Preview the data frame after the columns were dropped.
dataframe.head()

In [None]:
# Encode the string-valued columns as integers for the machine learning models.
# One LabelEncoder per column is kept under its own name so the same mapping can be
# re-applied to new data later on.
countryTransformer, currencyTransformer, main_categoryTransformer, categoryTransformer = (
    preprocessing.LabelEncoder() for _ in range(4)
)
column_encoders = (
    ('country', countryTransformer),
    ('currency', currencyTransformer),
    ('main_category', main_categoryTransformer),
    ('category', categoryTransformer),
)
for column, encoder in column_encoders:
    dataframe[column] = encoder.fit_transform(dataframe[column])



In [None]:
# Convert both date columns to integer Unix timestamps: parse the strings as
# datetimes, then reinterpret the values as int64 (nanoseconds since the epoch for
# pandas' default datetime64[ns] dtype).
for date_column in ("launched", "deadline"):
    parsed_dates = pd.to_datetime(dataframe[date_column])
    dataframe[date_column] = parsed_dates.values.astype(np.int64)

In [None]:
# Preview the data frame after encoding the categorical and date columns.
dataframe.head()

In [None]:
# Separate the target from the features:
# Y is the 'state' column, X is the data frame without it.
Y = dataframe['state']
X = dataframe.drop(columns='state')

In [None]:
# Split the data into 80% / 20%:
#   80% is for model training (X_train, Y_train),
#   20% is for model testing  (X_test, Y_test).
# The state of X_test is predicted and compared to the real labels in Y_test.
# random_state pins the shuffle so that re-running the notebook gives the same
# partition and therefore comparable accuracy figures.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [None]:
# Compare multiple classification models on the same train/test split.


# Prepare the models. The tree-based estimators are seeded so that their accuracy is
# the same on every run; KNeighborsClassifier and GaussianNB take no random_state.
classifiers = [
    DecisionTreeClassifier(random_state=42),
    GradientBoostingClassifier(random_state=42),
    KNeighborsClassifier(),
    RandomForestClassifier(random_state=42),
    GaussianNB()
]

# Fit and evaluate the models one by one to see which is the most accurate.
for clf in classifiers:
    clf.fit(X_train, Y_train)

    print("="*30)
    print(clf.__class__.__name__)

    prediction = clf.predict(X_test)
    acc = accuracy_score(Y_test, prediction)

    print("Accuracy: {:.2%}".format(acc))

print("="*30)
print('Finish')

In [None]:
# GradientBoostingClassifier was the most accurate model in the comparison, so
# fine-tune its learning_rate attribute.
# The estimator is seeded so that accuracy differences between runs come from the
# learning rate only.
# NOTE(review): the candidates are scored on the test set, so the selected value is
# tuned to that set; a separate validation split or cross-validation would give an
# unbiased estimate.
learning_rates = [1, 0.7, 0.5, 0.25, 0.1, 0.01]
for lrn in learning_rates:
    clf = GradientBoostingClassifier(learning_rate=lrn, random_state=42)
    clf.fit(X_train, Y_train)
    print("="*30)
    print(lrn)
    prediction = clf.predict(X_test)
    acc = accuracy_score(Y_test, prediction)
    print("Accuracy: {:.2%}".format(acc))

print("="*30)
print('Finish')

In [None]:
# Train the model with the best-performing learning rate and report its accuracy.
# More informative data features would allow a higher accuracy.
# NOTE(review): learning_rate=1 is hard-coded; presumably it was the best value in
# the sweep over learning rates - re-check after re-running that sweep.

clf = GradientBoostingClassifier(learning_rate=1).fit(X_train, Y_train)
print('****Results****')
prediction = clf.predict(X_test)
acc = accuracy_score(y_true=Y_test, y_pred=prediction)
print("Accuracy: {:.2%}".format(acc))

In [None]:
# Compare the predictions to the true data: two columns, the predicted state next
# to the actual state from Y_test.
np.column_stack((prediction,Y_test))

In [None]:
# Show the feature importances of the prediction model, ordered from least to
# most important, as a bar chart.
feats = dict(zip(X_train.columns, clf.feature_importances_))  # feature name -> importance

importances = pd.DataFrame.from_dict(feats, orient='index')
importances = importances.rename(columns={0: 'Feature-importance'})
importances.sort_values(by='Feature-importance').plot(kind='bar', rot=45)

In [None]:
# Add a new feature derived from the existing data:
# project_length = deadline - launched.
# Both columns hold integer timestamps at this point, so the difference is in the
# same unit (nanoseconds, per the date conversion earlier in the notebook); dividing by
# 10**9 * 60 * 60 * 24 would express it in days.
dataframe["project_length"] = dataframe["deadline"] - dataframe["launched"]

In [None]:
# Preview the data frame with the new project_length column.
dataframe.head()

In [None]:
# Rebuild the target and the features so that X includes the new column:
# Y is the 'state' column, X is the data frame without it.
Y = dataframe['state']
X = dataframe.drop(columns='state')

In [None]:
# Split the data 80% / 20% again, now including the new feature.
# random_state pins the shuffle so that re-running the notebook gives the same
# partition and the accuracy figure is reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [None]:
# Train and evaluate the same model again, this time with the newly added feature.
clf = GradientBoostingClassifier(learning_rate=1).fit(X_train, Y_train)
print('****Results****')
prediction = clf.predict(X_test)
acc = accuracy_score(y_true=Y_test, y_pred=prediction)
print("Accuracy: {:.2%}".format(acc))

In [None]:
# Feature importances of the retrained model, ordered from least to most important.
feats = dict(zip(X_train.columns, clf.feature_importances_))  # feature name -> importance

importances = pd.DataFrame.from_dict(feats, orient='index')
importances = importances.rename(columns={0: 'Feature-importance'})
importances.sort_values(by='Feature-importance').plot(kind='bar', rot=45)

In [None]:
# Build a one-row frame describing a not-yet-launched project to run through the model.
# The goal amounts are numeric (not strings) and the frame is built from a record
# instead of enlarging an empty frame, so the numeric columns get a float dtype
# rather than every column being object dtype.
future_project = {
    'category': 'Restaurants',
    'main_category': 'Food',
    'currency': 'USD',
    'deadline': '2019-07-30',
    'goal': 60000.0,
    'launched': '2019-04-01 12:00:00',
    'country': 'US',
    'usd_goal_real': 60000.0,
}
futureData = pd.DataFrame(
    [future_project],
    columns=['category', 'main_category', 'currency', 'deadline', 'goal', 'launched', 'country', 'usd_goal_real'],
)

In [None]:
# Preview the new project row before encoding.
futureData.head()

In [None]:
# Encode the string-valued fields of the new project with the encoders that were
# fitted on the training data (transform only, no re-fitting), so the row uses the
# same integer codes as the training frame.
fitted_encoders = (
    ('country', countryTransformer),
    ('currency', currencyTransformer),
    ('main_category', main_categoryTransformer),
    ('category', categoryTransformer),
)
for column, encoder in fitted_encoders:
    futureData[column] = encoder.transform(futureData[column])

In [None]:
# Preview the new project row after encoding the categorical fields.
futureData.head()

In [None]:
# Convert the new project's own date strings to integer Unix timestamps, the same
# way the training frame's date columns were converted.
# The dates must be read from futureData: reading them from `dataframe` would
# index-align the training frame's timestamps onto this row and replace the new
# project's launch date and deadline with those of a training project.
futureData["launched"] = pd.to_datetime(futureData["launched"])
futureData["deadline"] = pd.to_datetime(futureData["deadline"])
futureData["deadline"] =  futureData.deadline.values.astype(np.int64)
futureData["launched"] =  futureData.launched.values.astype(np.int64)

In [None]:
# Derive project_length for the new project the same way as for the training frame:
# deadline minus launched.
futureData["project_length"] = futureData["deadline"] - futureData["launched"]

In [None]:
# Predict the state of the new project.
# The columns are selected in the exact order of the training features so the model
# always receives them as it was fitted; a missing feature fails with a clear KeyError.
prediction = clf.predict(futureData[X_train.columns])

In [None]:
# Display the predicted state for the new project.
prediction