# Importing libraries


In [3]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go


# Load data


In [9]:
# Location of the raw Kickstarter export, relative to the notebook
csv_path = "./data/kickstarter_data_full.csv"
kickstarter_df = pd.read_csv(csv_path, low_memory=False)


# Drop unnecessary columns


In [10]:
# Drop the columns this notebook treats as unnecessary.
# Rebinding the name (instead of inplace=True) and ignoring labels that are
# already gone keeps the cell re-runnable without raising a KeyError.
kickstarter_df = kickstarter_df.drop(
    columns=["index", "Unnamed: 0", "id"], errors="ignore"
)


# Naive feature selection


## Remove features with more than 50% missing values


In [11]:
kickstarter_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20632 entries, 0 to 20631
Data columns (total 66 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   photo                        20632 non-null  object 
 1   name                         20632 non-null  object 
 2   blurb                        20626 non-null  object 
 3   goal                         20632 non-null  float64
 4   pledged                      20632 non-null  float64
 5   state                        20632 non-null  object 
 6   slug                         20632 non-null  object 
 7   disable_communication        20632 non-null  bool   
 8   country                      20632 non-null  object 
 9   currency                     20632 non-null  object 
 10  currency_symbol              20632 non-null  object 
 11  currency_trailing_code       20632 non-null  bool   
 12  deadline                     20632 non-null  object 
 13  state_changed_at

In [13]:
# Remove the columns selected for removal in this section.
# Rebinding the name (instead of inplace=True) and ignoring labels that are
# already gone keeps the cell re-runnable without raising a KeyError.
kickstarter_df = kickstarter_df.drop(
    columns=["friends", "is_starred", "is_backing", "permissions"],
    errors="ignore",
)


## Check features with zero variance


In [14]:
# Count the distinct values of every column once, then list the columns
# that only ever take a single value
distinct_counts = kickstarter_df.nunique()
distinct_counts[distinct_counts == 1]


Series([], dtype: int64)

## Check rows with missing values


In [15]:
# Flag every row holding at least one missing value, then count the flags
rows_with_missing = kickstarter_df.isna().any(axis="columns")
rows_with_missing.sum()


1937

In [17]:
# Rows in which the backer count or the USD pledged amount is missing
h6_inputs_missing = (
    kickstarter_df[["backers_count", "usd_pledged"]].isna().any(axis="columns")
)
kickstarter_df.loc[h6_inputs_missing]


Unnamed: 0,photo,name,blurb,goal,pledged,state,slug,disable_communication,country,currency,...,launch_to_deadline,launch_to_state_change,create_to_launch_days,launch_to_deadline_days,launch_to_state_change_days,SuccessfulBool,USorGB,TOPCOUNTRY,LaunchedTuesday,DeadlineWeekend


# Check the class distribution of the "SuccessfulBool" (dependent) variable


In [18]:
# How many projects fall into each value of the target column
success_flags = kickstarter_df["SuccessfulBool"]
success_flags.value_counts()


SuccessfulBool
0    14614
1     6018
Name: count, dtype: int64

In [19]:
# Summary statistics from describe(), cast to the nullable Float64 dtype
summary_stats = kickstarter_df.describe()
summary_stats.astype("Float64")


Unnamed: 0,goal,pledged,backers_count,static_usd_rate,usd_pledged,name_len,name_len_clean,blurb_len,blurb_len_clean,deadline_month,...,launched_at_yr,launched_at_hr,create_to_launch_days,launch_to_deadline_days,launch_to_state_change_days,SuccessfulBool,USorGB,TOPCOUNTRY,LaunchedTuesday,DeadlineWeekend
count,20632.0,20632.0,20632.0,20632.0,20632.0,20627.0,20627.0,20627.0,20627.0,20632.0,...,20632.0,20632.0,20632.0,20632.0,20632.0,20632.0,20632.0,20632.0,20632.0,20632.0
mean,94104.965285,21392.675739,183.675843,1.039363,20915.907911,5.940806,5.292578,18.991177,13.081204,6.707784,...,2014.752084,12.417168,49.577598,34.716896,31.169397,0.291683,0.806466,0.816208,0.225136,0.292022
std,1335511.390003,120497.251802,1222.012658,0.230419,115471.73309,2.826118,2.418168,4.632371,3.283547,3.41142,...,1.261752,5.574409,111.094601,11.873143,14.279705,0.454548,0.395078,0.387324,0.417682,0.454703
min,1.0,0.0,0.0,0.045641,0.0,1.0,1.0,1.0,1.0,1.0,...,2009.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,4000.0,25.0,2.0,1.0,25.0,4.0,3.0,17.0,11.0,4.0,...,2014.0,9.0,3.0,30.0,28.0,0.0,1.0,1.0,0.0,0.0
50%,14000.0,695.0,12.0,1.0,716.301193,6.0,5.0,20.0,13.0,7.0,...,2015.0,12.0,14.0,30.0,30.0,0.0,1.0,1.0,0.0,0.0
75%,50000.0,5954.25,63.0,1.0,6004.628177,8.0,7.0,22.0,15.0,10.0,...,2016.0,16.0,45.0,40.0,35.0,1.0,1.0,1.0,0.0,1.0
max,100000000.0,6225354.98,105857.0,1.715913,6225354.98,16.0,14.0,35.0,30.0,12.0,...,2017.0,23.0,1754.0,91.0,91.0,1.0,1.0,1.0,1.0,1.0


# Hypothesis: Are features like backers count associated with success?


# Hypothesis 6: The number of backers has a diminishing positive relationship with the amount of money pledged.


In [26]:
# Take an independent copy of the two columns under study. Without .copy() the
# result is a slice of kickstarter_df, and adding a column to it later raises
# pandas' SettingWithCopyWarning.
h6_df = kickstarter_df[["backers_count", "usd_pledged"]].copy()


### Square the backers_count column to check for a diminishing effect of backers count on money pledged


In [27]:
# Add the quadratic term. assign() builds a new frame rather than writing into
# the existing one, so no SettingWithCopyWarning is raised and re-running the
# cell gives the same result.
h6_df = h6_df.assign(backers_count_squared=h6_df["backers_count"] ** 2)


In [28]:
# Display the modelling frame (backers_count, usd_pledged and the squared term)
h6_df


Unnamed: 0,backers_count,usd_pledged,backers_count_squared
0,0,0.000000,0
1,0,0.000000,0
2,5,120.000000,25
3,0,0.000000,0
4,17,396.802395,289
...,...,...,...
20627,173,25886.197879,29929
20628,13,1644.428040,169
20629,0,0.000000,0
20630,0,0.000000,0


## Remove outliers


In [30]:
def remove_outlier(df_in, col_name, whisker=1.5):
    """Return a copy of ``df_in`` without the IQR outliers of one column.

    A row is kept when its ``col_name`` value lies within the closed interval
    ``[Q1 - whisker * IQR, Q3 + whisker * IQR]``, where Q1/Q3 are the 25th and
    75th percentiles of the column and ``IQR = Q3 - Q1``.

    The fences are inclusive: a value sitting exactly on a fence is not an
    outlier. This matters for columns whose IQR is zero (e.g. a constant
    column), where both fences collapse onto Q1 and strict comparisons would
    discard every row.

    Parameters
    ----------
    df_in : pandas.DataFrame
        Frame to filter; it is not modified.
    col_name : str
        Name of the numeric column used to detect outliers.
    whisker : float, default 1.5
        Multiplier applied to the IQR when placing the fences.

    Returns
    -------
    pandas.DataFrame
        Independent copy of the rows whose value is inside the fences.
        Rows with a missing value in ``col_name`` are dropped, because a
        missing value never compares as inside the interval.
    """
    values = df_in[col_name]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1

    fence_low = q1 - whisker * iqr
    fence_high = q3 + whisker * iqr

    # Series.between is inclusive on both ends by default.
    inside_fences = values.between(fence_low, fence_high)

    # .copy() so callers can add columns to the result without warnings.
    return df_in.loc[inside_fences].copy()


In [31]:
# Row count before any outlier filtering
h6_df.shape[0]


20632

In [32]:
# Drop rows whose backers_count falls outside the IQR fences
h6_df = remove_outlier(df_in=h6_df, col_name="backers_count")


In [33]:
# Row count after filtering on backers_count
h6_df.shape[0]


17566

In [34]:
# Drop rows whose usd_pledged falls outside the IQR fences
h6_df = remove_outlier(df_in=h6_df, col_name="usd_pledged")


In [35]:
# Row count after filtering on usd_pledged as well
h6_df.shape[0]


15470

# Split into X and y


In [36]:
# Separate the regression target from the predictor columns
target_column = "usd_pledged"
y = h6_df[target_column]
X = h6_df.drop(columns=[target_column])

In [37]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split reproducible
split_parts = train_test_split(X, y, random_state=42, test_size=0.2)
X_train, X_test, y_train, y_test = split_parts

In [38]:
from sklearn.linear_model import LinearRegression

# Linear regression estimator with scikit-learn's default settings; it is fitted
# on backers_count and its square in the next cell
lr = LinearRegression()

In [39]:
# Fit the regression on the training split
lr.fit(X=X_train, y=y_train)


In [40]:
# Report the fitted polynomial: intercept, linear term and squared term
intercept = lr.intercept_
linear_coef = lr.coef_[0]
squared_coef = lr.coef_[1]

print("y = {:.2f} + {:.2f}x + {:.2f}x^2".format(intercept, linear_coef, squared_coef))


y = -5.27 + 87.08x + -0.43x^2


In [42]:
from sklearn.metrics import mean_squared_error, r2_score

# Mean squared error on each split
mse_train = mean_squared_error(y_train, lr.predict(X_train))
mse_test = mean_squared_error(y_test, lr.predict(X_test))

print("Mean squared error (train): {:.2f}".format(mse_train))
print("Mean squared error (test): {:.2f}".format(mse_test))

Mean squared error (train): 726664.57
Mean squared error (test): 754581.00


In [47]:
# Root mean squared error: the square root of the MSE of each split
rmse_train = np.sqrt(mean_squared_error(y_train, lr.predict(X_train)))
rmse_test = np.sqrt(mean_squared_error(y_test, lr.predict(X_test)))

print("Root mean squared error (train): {:.2f}".format(rmse_train))
print("Root mean squared error (test): {:.2f}".format(rmse_test))


Root mean squared error (train): 852.45
Root mean squared error (test): 868.67


In [46]:
from sklearn.metrics import mean_absolute_error

# Mean absolute error on each split
mae_train = mean_absolute_error(y_train, lr.predict(X_train))
mae_test = mean_absolute_error(y_test, lr.predict(X_test))

print("Mean absolute error (train): {:.2f}".format(mae_train))
print("Mean absolute error (test): {:.2f}".format(mae_test))


Mean absolute error (train): 479.35
Mean absolute error (test): 488.30


In [43]:
# R^2 score on each split
r2_train = r2_score(y_train, lr.predict(X_train))
r2_test = r2_score(y_test, lr.predict(X_test))

print("Coefficient of determination (train): {}".format(r2_train))
print("Coefficient of determination (test): {}".format(r2_test))


Coefficient of determination (train): 0.6601980013443078
Coefficient of determination (test): 0.6686859787841384


In [54]:
# Training observations (blue) with the model's predictions for the same
# backer counts (red)
train_backers = X_train["backers_count"]

observed_trace = go.Scatter(
    x=train_backers,
    y=y_train,
    mode="markers",
    name="Training Data",
    marker={"color": "blue"},
)
predicted_trace = go.Scatter(
    x=train_backers,
    y=lr.predict(X_train),
    mode="markers",
    name="Linear Regression Line",
    marker={"color": "red"},
)

fig = go.Figure(data=[observed_trace, predicted_trace])
fig.update_layout(
    title="Linear Regression Line with Squared Term",
    xaxis_title="Backers Count",
    yaxis_title="USD Pledged",
)
fig.show()

In [44]:
# Plot actual vs. predicted values on the test split, with the y = x reference
# on which a perfect prediction would lie.
y_test_pred = lr.predict(X_test)

# Two end points are enough to draw the diagonal across the observed range.
diagonal_ends = [y_test.min(), y_test.max()]

fig = go.Figure()
fig.add_trace(
    go.Scatter(
        x=y_test,
        y=y_test_pred,
        mode="markers",
        name="data",
        marker=dict(color="blue"),
    )
)

# Drawn with mode="lines" so the reference is an actual line and the
# `line` styling (width=3) is rendered.
fig.add_trace(
    go.Scatter(
        x=diagonal_ends,
        y=diagonal_ends,
        mode="lines",
        name="fit",
        line=dict(color="red", width=3),
    )
)

fig.update_layout(
    title="Actual vs Predicted",
    xaxis_title="Actual",
    yaxis_title="Predicted",
    font=dict(family="Courier New, monospace", size=18, color="#7f7f7f"),
)

fig.show()


In [45]:
# Plot the test-split residuals (actual - predicted) against the predicted
# values, so the x-axis matches the "Predicted" title and axis label.
y_test_pred = lr.predict(X_test)
residuals = y_test - y_test_pred

fig = go.Figure()
fig.add_trace(
    go.Scatter(
        x=y_test_pred,
        y=residuals,
        mode="markers",
        name="data",
        marker=dict(color="blue"),
    )
)

# Zero-residual reference, drawn as a line spanning the predicted range so the
# `line` styling (width=3) is rendered.
fig.add_trace(
    go.Scatter(
        x=[y_test_pred.min(), y_test_pred.max()],
        y=[0, 0],
        mode="lines",
        name="fit",
        line=dict(color="red", width=3),
    )
)

fig.update_layout(
    title="Residuals vs Predicted",
    xaxis_title="Predicted",
    yaxis_title="Residuals",
    font=dict(family="Courier New, monospace", size=18, color="#7f7f7f"),
)

fig.show()

In [48]:
import tensorflow as tf


In [49]:
# set the seed
tf.random.set_seed(42)

# Number of training epochs, defined once so the fit call and the loss plot
# cannot drift apart
n_epochs = 100

# create a sequential model
model = tf.keras.models.Sequential()

# add 1 dense layer; the input width follows the number of feature columns
# instead of being hard-coded
model.add(tf.keras.layers.Dense(1, input_shape=(X_train.shape[1],)))

# compile the model with adam optimizer and mean squared error loss function
model.compile(
    loss="mean_squared_error",
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.015),
    metrics=["mean_squared_error"],
)

# print the summary
model.summary()

# train the model, using the test split as validation data
history = model.fit(
    X_train,
    y_train,
    epochs=n_epochs,
    batch_size=128,
    verbose=1,
    validation_data=(X_test, y_test),
)

# x-axis taken from the epochs actually recorded in the history, so the plot
# stays correct if the epoch count changes
epoch_axis = list(range(1, len(history.history["loss"]) + 1))

# plot the loss
fig = go.Figure()
fig.add_trace(
    go.Scatter(
        x=epoch_axis,
        y=history.history["loss"],
        mode="lines",
        name="loss",
        line=dict(color="blue"),
    )
)

fig.add_trace(
    go.Scatter(
        x=epoch_axis,
        y=history.history["val_loss"],
        mode="lines",
        name="val_loss",
        line=dict(color="red"),
    )
)

fig.update_layout(
    title="Loss vs Epochs",
    xaxis_title="Epochs",
    yaxis_title="Loss",
    font=dict(family="Courier New, monospace", size=18, color="#7f7f7f"),
)

fig.show()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 1)                 3         
                                                                 
Total params: 3
Trainable params: 3
Non-trainable params: 0
_________________________________________________________________
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
E