# Importing libraries

In [1]:
import warnings

warnings.filterwarnings("ignore")

In [2]:
import pandas as pd

# Load data

In [3]:
# Read the full Kickstarter export; low_memory=False makes pandas parse the
# file in one pass instead of in chunks.
kickstarter_df = pd.read_csv(
    "./inputs/kickstarter_data_full.csv",
    low_memory=False,
)

# Drop unnecessary columns

In [4]:
# Remove the two index-like columns carried over from the CSV export.
kickstarter_df = kickstarter_df.drop(columns=["index", "Unnamed: 0"])

# Naive feature selection

In [5]:
# The "id" column is dropped as part of the naive feature selection.
kickstarter_df = kickstarter_df.drop(columns=["id"])

In [6]:
kickstarter_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20632 entries, 0 to 20631
Data columns (total 66 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   photo                        20632 non-null  object 
 1   name                         20632 non-null  object 
 2   blurb                        20626 non-null  object 
 3   goal                         20632 non-null  float64
 4   pledged                      20632 non-null  float64
 5   state                        20632 non-null  object 
 6   slug                         20632 non-null  object 
 7   disable_communication        20632 non-null  bool   
 8   country                      20632 non-null  object 
 9   currency                     20632 non-null  object 
 10  currency_symbol              20632 non-null  object 
 11  currency_trailing_code       20632 non-null  bool   
 12  deadline                     20632 non-null  object 
 13  state_changed_at

# Remove features with more than 50% missing values

In [7]:
# Columns removed in this section (mostly-missing features).
sparse_columns = ["friends", "is_starred", "is_backing", "permissions"]
kickstarter_df = kickstarter_df.drop(columns=sparse_columns)

# Remove features with zero variance

In [8]:
# Count distinct values per column once, then show the columns that hold
# exactly one distinct value.
unique_counts = kickstarter_df.nunique()
unique_counts[unique_counts == 1]

Series([], dtype: int64)

# Check rows with missing values

In [9]:
# Number of rows that contain at least one missing value in any column.
kickstarter_df.isna().any(axis="columns").sum()

1937

In [60]:
# Show the rows where any of the columns needed for the analysis is missing.
required_columns = [
    "goal",
    "name_len_clean",
    "blurb_len_clean",
    "backers_count",
    "create_to_launch",
    "launch_to_deadline",
    "launch_to_state_change",
    "pledged",
    "usd_pledged",
    "static_usd_rate",
]
has_missing_required = kickstarter_df[required_columns].isnull().any(axis=1)
kickstarter_df[has_missing_required]

Unnamed: 0,photo,name,blurb,goal,pledged,state,slug,disable_communication,country,currency,...,launch_to_deadline,launch_to_state_change,create_to_launch_days,launch_to_deadline_days,launch_to_state_change_days,SuccessfulBool,USorGB,TOPCOUNTRY,LaunchedTuesday,DeadlineWeekend
1411,"{""small"":""https://ksr-ugc.imgix.net/assets/012...",N/A (Canceled),,1500000.0,0.0,canceled,long-island-school-auditorium,False,US,USD,...,30 days 00:00:00.000000000,0 days 05:45:12.000000000,0,30,0,0,1,1,0,0
6744,"{""small"":""https://ksr-ugc.imgix.net/assets/014...",N/A (Canceled),,30000.0,619.0,canceled,teamstar-sports-community-development,False,AU,AUD,...,30 days 00:00:00.000000000,15 days 03:11:27.000000000,34,30,15,0,0,0,0,0
9239,"{""small"":""https://ksr-ugc.imgix.net/assets/012...",Star Wars Bluetooth Speakers (Canceled),,60000.0,36058.0,canceled,star-wars-bluetooth-speakers,False,GB,GBP,...,30 days 00:00:00.000000000,2 days 09:38:30.000000000,90,30,2,0,1,1,0,0
11708,"{""small"":""https://ksr-ugc.imgix.net/assets/011...",OF Press - A WordPress Theme and Site Builder ...,,5000.0,71.0,canceled,of-press,False,US,USD,...,30 days 00:00:00.000000000,19 days 01:12:46.000000000,0,30,19,0,1,1,0,1
14805,"{""small"":""https://ksr-ugc.imgix.net/assets/011...",TEST (Canceled),,1000001.0,31.0,canceled,caiman-connected-the-ultimate-mobile-device-ac...,False,US,USD,...,14 days 00:00:00.000000000,12 days 20:37:21.000000000,6,14,12,0,1,1,0,0


In [61]:
# Drop every row that is missing a value in any of the columns used by the
# analysis below.
kickstarter_df = kickstarter_df.dropna(
    subset=[
        "goal",
        "name_len_clean",
        "blurb_len_clean",
        "backers_count",
        "create_to_launch",
        "launch_to_deadline",
        "launch_to_state_change",
        "pledged",
        "usd_pledged",
        "static_usd_rate",
    ]
)

# Check the cases of the "state" (dependent) variable

In [62]:
kickstarter_df["state"].value_counts()

state
failed        11416
successful     6018
canceled       2455
live            508
suspended       230
Name: count, dtype: int64

In [63]:
kickstarter_df.describe().astype("Float64")

Unnamed: 0,goal,pledged,backers_count,static_usd_rate,usd_pledged,name_len,name_len_clean,blurb_len,blurb_len_clean,deadline_month,...,launched_at_yr,launched_at_hr,create_to_launch_days,launch_to_deadline_days,launch_to_state_change_days,SuccessfulBool,USorGB,TOPCOUNTRY,LaunchedTuesday,DeadlineWeekend
count,20627.0,20627.0,20627.0,20627.0,20627.0,20627.0,20627.0,20627.0,20627.0,20627.0,...,20627.0,20627.0,20627.0,20627.0,20627.0,20627.0,20627.0,20627.0,20627.0,20627.0
mean,94001.970367,21396.078288,183.708053,1.03936,20918.300266,5.940806,5.292578,18.991177,13.081204,6.707471,...,2014.752073,12.416881,49.583313,34.718815,31.174625,0.291754,0.806467,0.816212,0.22519,0.292044
std,1335622.241364,120511.450525,1222.158106,0.230413,115485.126394,2.826118,2.418168,4.632371,3.283547,3.411277,...,1.261851,5.573344,111.106169,11.873524,14.277022,0.45458,0.395076,0.387321,0.417718,0.454714
min,1.0,0.0,0.0,0.045641,0.0,1.0,1.0,1.0,1.0,1.0,...,2009.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,4000.0,25.0,2.0,1.0,25.0,4.0,3.0,17.0,11.0,4.0,...,2014.0,9.0,3.0,30.0,28.0,0.0,1.0,1.0,0.0,0.0
50%,14000.0,697.0,12.0,1.0,717.0,6.0,5.0,20.0,13.0,7.0,...,2015.0,12.0,14.0,30.0,30.0,0.0,1.0,1.0,0.0,0.0
75%,50000.0,5954.5,63.0,1.0,6004.752118,8.0,7.0,22.0,15.0,10.0,...,2016.0,16.0,45.0,40.0,35.0,1.0,1.0,1.0,0.0,1.0
max,100000000.0,6225354.98,105857.0,1.715913,6225354.98,16.0,14.0,35.0,30.0,12.0,...,2017.0,23.0,1754.0,91.0,91.0,1.0,1.0,1.0,1.0,1.0


# Encode the "state" variable

In [64]:
from sklearn.preprocessing import LabelEncoder

In [65]:
# Encoder used to map the string values of "state" to integer codes.
label_encoder = LabelEncoder()

In [66]:
# Work on a copy so the original string labels stay available in kickstarter_df.
kickstarter_df_encoded = kickstarter_df.copy()

In [67]:
# Fit the encoder on the "state" labels and replace the column with the codes.
encoded_state = label_encoder.fit_transform(kickstarter_df_encoded["state"])
kickstarter_df_encoded["state"] = encoded_state

In [68]:
label_encoder.classes_

array(['canceled', 'failed', 'live', 'successful', 'suspended'],
      dtype=object)

In [69]:
label_encoder.inverse_transform([0, 1, 2, 3, 4])

array(['canceled', 'failed', 'live', 'successful', 'suspended'],
      dtype=object)

In [70]:
kickstarter_df_encoded["state"].value_counts()

state
1    11416
3     6018
0     2455
2      508
4      230
Name: count, dtype: int64

In [71]:
kickstarter_df_encoded.columns

Index(['photo', 'name', 'blurb', 'goal', 'pledged', 'state', 'slug',
       'disable_communication', 'country', 'currency', 'currency_symbol',
       'currency_trailing_code', 'deadline', 'state_changed_at', 'created_at',
       'launched_at', 'staff_pick', 'backers_count', 'static_usd_rate',
       'usd_pledged', 'creator', 'location', 'category', 'profile',
       'spotlight', 'urls', 'source_url', 'name_len', 'name_len_clean',
       'blurb_len', 'blurb_len_clean', 'deadline_weekday',
       'state_changed_at_weekday', 'created_at_weekday', 'launched_at_weekday',
       'deadline_month', 'deadline_day', 'deadline_yr', 'deadline_hr',
       'state_changed_at_month', 'state_changed_at_day', 'state_changed_at_yr',
       'state_changed_at_hr', 'created_at_month', 'created_at_day',
       'created_at_yr', 'created_at_hr', 'launched_at_month',
       'launched_at_day', 'launched_at_yr', 'launched_at_hr',
       'create_to_launch', 'launch_to_deadline', 'launch_to_state_change',
       'c

In [72]:
kickstarter_df_encoded[['create_to_launch', 'launch_to_deadline', 'launch_to_state_change']]

Unnamed: 0,create_to_launch,launch_to_deadline,launch_to_state_change
0,17 days 14:51:39.000000000,36 days 20:47:24.000000000,36 days 20:47:24.000000000
1,10 days 06:44:39.000000000,60 days 00:00:00.000000000,60 days 00:00:02.000000000
2,1 days 08:08:58.000000000,60 days 00:00:00.000000000,60 days 00:00:01.000000000
3,0 days 02:11:17.000000000,30 days 00:00:00.000000000,30 days 00:00:00.000000000
4,0 days 15:47:38.000000000,32 days 06:02:33.000000000,32 days 06:02:33.000000000
...,...,...,...
20627,5 days 13:26:50.000000000,31 days 23:47:41.000000000,31 days 23:47:42.000000000
20628,56 days 02:24:44.000000000,30 days 00:00:00.000000000,30 days 00:00:01.000000000
20629,0 days 23:56:32.000000000,60 days 00:00:00.000000000,60 days 00:00:04.000000000
20630,1 days 17:10:17.000000000,30 days 00:00:00.000000000,30 days 00:00:00.000000000


# Hypothesis 5: Are features like backers count associated with success?

In [73]:
# Numeric features used for this hypothesis.
# The selection is copied explicitly because later cells assign new columns
# into h7_df; without .copy() those assignments are chained assignments on a
# frame derived from kickstarter_df_encoded (SettingWithCopyWarning).
h7_df = kickstarter_df_encoded[
    [
        "goal",
        "name_len_clean",
        "blurb_len_clean",
        "backers_count",
        "create_to_launch",
        "launch_to_deadline",
        "launch_to_state_change",
        "pledged",
        "usd_pledged",
        "static_usd_rate",
    ]
].copy()

In [77]:
def convert_to_days(row):
    """Return the whole-day component of a duration.

    Parameters
    ----------
    row : str or timedelta-like
        Either a string such as ``"36 days 20:47:24.000000000"`` (the day
        count is the first whitespace-separated token) or an object exposing
        a ``days`` attribute (e.g. ``datetime.timedelta``).

    Returns
    -------
    int
        The number of days; the time-of-day remainder is discarded.

    Raises
    ------
    ValueError
        If the first token of a string input is not an integer.
    """
    # Timedelta-like objects already carry the day count.
    days_attr = getattr(row, "days", None)
    if days_attr is not None:
        return int(days_attr)
    # split() without an argument ignores leading/repeated whitespace, so
    # "  36 days ..." still yields "36" as the first token.
    return int(row.split()[0])

In [78]:
# Replace each duration column with its whole-day count.
for duration_col in ("create_to_launch", "launch_to_deadline", "launch_to_state_change"):
    h7_df[duration_col] = h7_df[duration_col].apply(convert_to_days)

In [79]:
h7_df

Unnamed: 0,goal,name_len_clean,blurb_len_clean,backers_count,create_to_launch,launch_to_deadline,launch_to_state_change,pledged,usd_pledged,static_usd_rate
0,1500.0,9.0,16.0,0,17,36,36,0.0,0.000000,1.000000
1,500.0,4.0,15.0,0,10,60,60,0.0,0.000000,1.000000
2,100000.0,8.0,10.0,5,1,60,60,120.0,120.000000,1.000000
3,5000.0,6.0,13.0,0,0,30,30,0.0,0.000000,1.000000
4,3222.0,7.0,18.0,17,0,32,32,356.0,396.802395,1.114613
...,...,...,...,...,...,...,...,...,...,...
20627,32500.0,5.0,16.0,173,5,31,31,25868.0,25886.197879,1.000703
20628,100000.0,2.0,15.0,13,56,30,30,1559.0,1644.428040,1.054797
20629,10000.0,3.0,17.0,0,0,60,60,0.0,0.000000,1.000000
20630,2500.0,1.0,6.0,0,1,30,30,0.0,0.000000,1.000000


In [80]:
def covert_to_usd(row, col_name):
    """Convert the amount stored in ``row[col_name]`` to USD.

    The amount is multiplied by ``row["static_usd_rate"]``; when the rate is
    exactly 1 the amount is returned unchanged.
    """
    rate = row["static_usd_rate"]
    if rate == 1:
        return row[col_name]
    return row[col_name] * rate

In [81]:
# Derive the USD amounts row by row. "usd_goal" is a new column, while the
# existing "usd_pledged" column is overwritten with pledged * static_usd_rate.
for usd_col, source_col in (("usd_goal", "goal"), ("usd_pledged", "pledged")):
    h7_df[usd_col] = h7_df.apply(covert_to_usd, args=(source_col,), axis=1)

In [82]:
# The local-currency amounts and the exchange rate are no longer needed once
# the USD columns exist.
h7_df = h7_df.drop(columns=["goal", "pledged", "static_usd_rate"])

In [83]:
h7_df

Unnamed: 0,name_len_clean,blurb_len_clean,backers_count,create_to_launch,launch_to_deadline,launch_to_state_change,usd_pledged,usd_goal
0,9.0,16.0,0,17,36,36,0.000000,1500.000000
1,4.0,15.0,0,10,60,60,0.000000,500.000000
2,8.0,10.0,5,1,60,60,120.000000,100000.000000
3,6.0,13.0,0,0,30,30,0.000000,5000.000000
4,7.0,18.0,17,0,32,32,396.802395,3591.284600
...,...,...,...,...,...,...,...,...
20627,5.0,16.0,173,5,31,31,25886.197879,32522.863425
20628,2.0,15.0,13,56,30,30,1644.428040,105479.669000
20629,3.0,17.0,0,0,60,60,0.000000,10000.000000
20630,1.0,6.0,0,1,30,30,0.000000,2500.000000


In [84]:
# All features except the target, and the target itself.
# NOTE(review): both X and y are reassigned later in the notebook (after the
# outlier removal) before any model uses them, so these values are never read.
X = h7_df.drop(columns="usd_pledged")
y = h7_df["usd_pledged"]

In [85]:
# Pairwise correlation matrix of all columns in h7_df (pandas default method).
corr_df = h7_df.corr()

In [86]:
import plotly.graph_objects as go

In [89]:
# Render the correlation matrix as a heatmap with the colour scale pinned to
# the full [-1, 1] correlation range.
corr_heatmap = go.Heatmap(
    z=corr_df.values,
    x=corr_df.columns,
    y=corr_df.columns,
    zmin=-1,
    zmax=1,
)
fig = go.Figure(data=[corr_heatmap])
fig.show()

In [96]:
def remove_outliers_backers_count(df, col_name, factor=1.5):
    """Return the rows of ``df`` whose ``col_name`` value lies within the IQR fences.

    Despite its name, the function works on any numeric column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter; it is not modified.
    col_name : str
        Column whose values are tested against the fences.
    factor : float, default 1.5
        Multiple of the inter-quartile range added beyond Q1/Q3 to form the
        lower and upper fences.

    Returns
    -------
    pandas.DataFrame
        The rows with ``Q1 - factor*IQR <= value <= Q3 + factor*IQR``.
        Rows with a missing value in ``col_name`` are excluded.
    """
    q1 = df[col_name].quantile(0.25)
    q3 = df[col_name].quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - factor * iqr
    upper_bound = q3 + factor * iqr
    # The fences are inclusive: with strict inequalities a column whose IQR
    # is 0 (both fences equal Q1) would lose every row.
    return df[df[col_name].between(lower_bound, upper_bound)]

In [97]:
# Keep only rows whose backers_count lies within the IQR fences.
# NOTE: h7_df is rebound to the filtered frame, so re-running this cell
# recomputes the quartiles on already-filtered data and may drop more rows.
h7_df = remove_outliers_backers_count(h7_df, "backers_count")

In [98]:
h7_df

Unnamed: 0,name_len_clean,blurb_len_clean,backers_count,create_to_launch,launch_to_deadline,launch_to_state_change,usd_pledged,usd_goal
0,9.0,16.0,0,17,36,36,0.000000,1500.00000
1,4.0,15.0,0,10,60,60,0.000000,500.00000
2,8.0,10.0,5,1,60,60,120.000000,100000.00000
3,6.0,13.0,0,0,30,30,0.000000,5000.00000
4,7.0,18.0,17,0,32,32,396.802395,3591.28460
...,...,...,...,...,...,...,...,...
20626,7.0,16.0,9,2,35,35,72.630572,3595.57285
20628,2.0,15.0,13,56,30,30,1644.428040,105479.66900
20629,3.0,17.0,0,0,60,60,0.000000,10000.00000
20630,1.0,6.0,0,1,30,30,0.000000,2500.00000


# Split into X and y

In [99]:
# Single-feature design matrix (kept 2-D) and the regression target.
X = h7_df.loc[:, ["backers_count"]]
y = h7_df.loc[:, "usd_pledged"]

In [116]:
# normalize data
from sklearn.preprocessing import MinMaxScaler

In [117]:
# Scale the feature and the target to [0, 1] with *separate* scalers.
# Reusing one scaler for both would overwrite the parameters fitted on X when
# it is refitted on y, so neither could be inverse-transformed correctly.
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

y_scaler = MinMaxScaler()
# MinMaxScaler expects a 2-D array, hence the reshape of the 1-D target.
y_scaled = y_scaler.fit_transform(y.values.reshape(-1, 1))
# NOTE(review): both scalers are fitted on the full data before the
# train/test split, so the test set influences the scaling — consider
# fitting on the training portion only.

In [118]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y_scaled,
    test_size=0.2,
    random_state=42,
)

In [119]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression()

In [120]:
# Fit the linear model on the scaled training data.
lr.fit(X_train, y_train)

In [121]:
# The model was fitted on a 2-D target, so intercept_ and coef_ carry an
# extra leading dimension that is indexed away here.
intercept = lr.intercept_[0]
slope = lr.coef_[0][0]
print("y = {:.2f} + {:.2f} * x1".format(intercept, slope))

y = -0.00 + 0.09 * x1


In [123]:
# Plot the observed points and the fitted regression line.
# lr was fitted on a 2-D target, so predict() returns an (n, 1) array;
# column 0 is taken so Plotly receives a 1-D sequence for y.
train_pred = lr.predict(X_train)[:, 0]
test_pred = lr.predict(X_test)[:, 0]

fig = go.Figure()
fig.add_trace(
    go.Scatter(
        x=X_train[:, 0],
        y=y_train[:, 0],
        mode="markers",
        name="train",
    )
)
fig.add_trace(
    go.Scatter(
        x=X_test[:, 0],
        y=y_test[:, 0],
        mode="markers",
        name="test",
    )
)
fig.add_trace(
    go.Scatter(
        x=X_train[:, 0],
        y=train_pred,
        mode="lines",
        name="train prediction",
    )
)
fig.add_trace(
    go.Scatter(
        x=X_test[:, 0],
        y=test_pred,
        mode="lines",
        name="test prediction",
    )
)
# Label the axes so the figure can be read on its own.
fig.update_layout(
    xaxis_title="backers_count (min-max scaled)",
    yaxis_title="usd_pledged (min-max scaled)",
)
fig.show()

In [125]:
from sklearn.metrics import mean_squared_error, r2_score
# Test-set error, measured on the min-max scaled target (values in [0, 1]),
# which is why it is numerically small.
test_mse = mean_squared_error(y_test, lr.predict(X_test))
print("Mean squared error: {:.2f}".format(test_mse))

Mean squared error: 0.00


In [126]:
# R^2 of the fitted model on the held-out test set.
test_r2 = r2_score(y_test, lr.predict(X_test))
print("Coefficient of determination: {:.2f}".format(test_r2))

Coefficient of determination: 0.29
