# Importing libraries


In [1]:
import pandas as pd
import numpy as np


# Load data


In [2]:
# Location of the raw Kickstarter export, relative to the notebook.
raw_data_path = "./data/Part2.Team8.kickstarter_data_full.csv"

# low_memory=False disables chunked parsing so dtypes are inferred from whole columns.
kickstarter_df = pd.read_csv(raw_data_path, low_memory=False)

# Drop unnecessary columns


In [3]:
# Columns this notebook treats as unnecessary; the frame is rebound instead of
# being mutated in place.
unnecessary_columns = ["index", "Unnamed: 0", "id"]
kickstarter_df = kickstarter_df.drop(columns=unnecessary_columns)

# Naive feature selection


## Remove features with more than 50% missing values


In [4]:
# Print the per-column non-null counts and dtypes to spot sparsely populated columns.
kickstarter_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20632 entries, 0 to 20631
Data columns (total 66 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   photo                        20632 non-null  object 
 1   name                         20632 non-null  object 
 2   blurb                        20626 non-null  object 
 3   goal                         20632 non-null  float64
 4   pledged                      20632 non-null  float64
 5   state                        20632 non-null  object 
 6   slug                         20632 non-null  object 
 7   disable_communication        20632 non-null  bool   
 8   country                      20632 non-null  object 
 9   currency                     20632 non-null  object 
 10  currency_symbol              20632 non-null  object 
 11  currency_trailing_code       20632 non-null  bool   
 12  deadline                     20632 non-null  object 
 13  state_changed_at

In [5]:
# Columns removed under the ">50% missing" heading
# (presumably chosen from the info() listing above — verify against the full output).
mostly_missing_columns = ["friends", "is_starred", "is_backing", "permissions"]
kickstarter_df = kickstarter_df.drop(columns=mostly_missing_columns)


## Remove features with zero variance


In [6]:
# Count distinct values per column once, then show the columns holding a single value.
distinct_value_counts = kickstarter_df.nunique()
distinct_value_counts[distinct_value_counts == 1]


Series([], dtype: int64)

## Check how many rows have missing values


In [7]:
# Flag every row containing at least one missing cell, then count the flagged rows.
row_has_missing = kickstarter_df.isna().any(axis="columns")
row_has_missing.sum()


1937

## check for features that have missing values


In [8]:
# Missing-cell count per column, limited to the columns that actually have gaps.
missing_per_column = kickstarter_df.isna().sum()
missing_per_column[missing_per_column > 0]


blurb                 6
location             45
category           1889
name_len              5
name_len_clean        5
blurb_len             5
blurb_len_clean       5
dtype: int64

### Create the KNN imputer used to handle missing values below


In [9]:
from sklearn.impute import KNNImputer

# Single imputer instance (5 neighbours) that later cells re-fit per column via fit_transform.
knn_imputer = KNNImputer(n_neighbors=5)


### handle missing values for category


In [10]:
# Fill missing categories with the most frequent category.
#
# The previous approach label-encoded the categories and ran KNNImputer on that
# single column. With one feature there is nothing to measure neighbour distance
# on, so the imputer falls back to the column mean; the mean of arbitrary integer
# codes, truncated to an int, then selected an essentially random category.
# Mode imputation is the standard, well-defined choice for a lone categorical column.
category_modes = kickstarter_df["category"].mode(dropna=True)

# mode() is empty only when every value is missing; leave the column untouched then.
if not category_modes.empty:
    most_frequent_category = category_modes.iloc[0]
    kickstarter_df["category"] = kickstarter_df["category"].fillna(
        most_frequent_category
    )

### handle missing values for name_len


In [11]:
# Re-fit the shared imputer on this column alone and write the filled values back.
name_len_imputed = knn_imputer.fit_transform(kickstarter_df[["name_len"]])
kickstarter_df["name_len"] = name_len_imputed[:, 0]

### handle missing values for name_len_clean


In [12]:
# Re-fit the shared imputer on this column alone and write the filled values back.
name_len_clean_imputed = knn_imputer.fit_transform(kickstarter_df[["name_len_clean"]])
kickstarter_df["name_len_clean"] = name_len_clean_imputed[:, 0]


### handle missing values for blurb_len


In [13]:
# Re-fit the shared imputer on this column alone and write the filled values back.
blurb_len_imputed = knn_imputer.fit_transform(kickstarter_df[["blurb_len"]])
kickstarter_df["blurb_len"] = blurb_len_imputed[:, 0]

### handle missing values for blurb_len_clean


In [14]:
# Re-fit the shared imputer on this column alone and write the filled values back.
blurb_len_clean_imputed = knn_imputer.fit_transform(kickstarter_df[["blurb_len_clean"]])
kickstarter_df["blurb_len_clean"] = blurb_len_clean_imputed[:, 0]


### handle missing values for blurb


In [15]:
# Replace missing blurbs with an empty string.
# The result is assigned back to the column: calling fillna(..., inplace=True) on
# the selected column operates on a temporary object under pandas Copy-on-Write,
# so the DataFrame itself would be left unchanged.
kickstarter_df["blurb"] = kickstarter_df["blurb"].fillna("")


## Normalize the goal amount to USD to match the usd_pledged


In [16]:
# Convert each goal into USD with the row's static exchange rate.
usd_rate = kickstarter_df["static_usd_rate"]
kickstarter_df["usd_goal"] = kickstarter_df["goal"].mul(usd_rate)

## Select only the numerical columns


In [17]:
# Keep the numeric columns only.
all_numeric_df = kickstarter_df.select_dtypes(include=["number"])

# The exchange rate and the un-normalized goal are dropped because they do not
# determine whether a project succeeds.
kickstarter_numeric_df = all_numeric_df.drop(columns=["static_usd_rate", "goal"])

## Visualize the feature importance via xgboost


In [18]:
import plotly.graph_objects as go
from xgboost import XGBClassifier

# Split the numeric frame into predictors and the success label once.
target_column = "SuccessfulBool"
feature_matrix = kickstarter_numeric_df.drop(columns=[target_column])
target_values = kickstarter_numeric_df[target_column]

# NOTE(review): the predictors still include outcome columns such as
# usd_pledged / backers_count — confirm that is intended for this ranking.
xgb_model = XGBClassifier()
xgb_model.fit(feature_matrix, target_values)

# One row per predictor, ordered from most to least important.
feature_importance_df = pd.DataFrame(
    {
        "feature": feature_matrix.columns,
        "importance": xgb_model.feature_importances_,
    }
).sort_values(by="importance", ascending=False)

importance_bars = go.Bar(
    x=feature_importance_df["feature"],
    y=feature_importance_df["importance"],
    marker_color="rgb(171, 226, 251)",
)
fig = go.Figure(data=[importance_bars])

importance_layout = dict(
    title="Feature Importance",
    title_x=0.5,
    width=800,
    height=800,
    xaxis_showgrid=False,
    yaxis_showgrid=False,
)
fig.update_layout(**importance_layout)
fig.show()

## Visualize dendrogram


In [19]:
import plotly.figure_factory as ff

# Cluster the numeric columns using their pairwise correlation matrix as input.
numeric_corr = kickstarter_numeric_df.corr()
fig = ff.create_dendrogram(
    numeric_corr,
    orientation="left",
    labels=kickstarter_numeric_df.columns,
)

dendrogram_layout = dict(
    title="Dendrogram",
    title_x=0.5,
    width=1000,
    height=1000,
    xaxis_showgrid=False,
    yaxis_showgrid=False,
)
fig.update_layout(**dendrogram_layout)
fig.show()


## Visualize bar plot for the cases of success and failure


In [20]:
# Bar chart of failed vs. successful projects with percentage labels.
#
# value_counts() orders its result by frequency, so the counts are re-indexed to
# the fixed order [0, 1]. This keeps every count paired with the right label and
# colour even if successful projects ever outnumber failed ones.
outcome_labels = {0: "Failed", 1: "Successful"}
outcome_colors = {0: "rgb(251, 171, 171)", 1: "rgb(171, 226, 251)"}

outcome_counts = (
    kickstarter_df["SuccessfulBool"]
    .value_counts()
    .reindex(list(outcome_labels), fill_value=0)
)
total_projects = kickstarter_df["SuccessfulBool"].shape[0]

fig = go.Figure()
fig.add_trace(
    go.Bar(
        x=[outcome_labels[outcome] for outcome in outcome_counts.index],
        y=outcome_counts.values,
        marker_color=[outcome_colors[outcome] for outcome in outcome_counts.index],
        # percentage label
        text=[
            str(np.round(count / total_projects * 100, 2)) + "%"
            for count in outcome_counts.values
        ],
        textposition="auto",
    )
)
fig.update_layout(
    title="Number of Successful and Failed Projects",
    title_x=0.5,
    width=800,
    height=800,
    xaxis_showgrid=False,
    yaxis_showgrid=False,
)
fig.show()


## Visualize splom plot for the numeric variables


In [21]:
# Scatter-plot matrix of the three money/backer columns, coloured by outcome.
splom_columns = ["backers_count", "usd_goal", "usd_pledged"]

fig = go.Figure()
fig.add_trace(
    go.Splom(
        dimensions=[
            dict(label=column, values=kickstarter_numeric_df[column])
            for column in splom_columns
        ],
        text=[
            "Failed" if val == 0 else "Successful"
            for val in kickstarter_numeric_df["SuccessfulBool"]
        ],
        marker=dict(
            color=kickstarter_numeric_df["SuccessfulBool"],
            colorscale=[[0, "rgb(251, 100, 100)"], [1, "rgb(100, 100, 251)"]],
        ),
        customdata=kickstarter_numeric_df["SuccessfulBool"],
        # Each panel plots a different pair of columns, so the hover text takes the
        # names from the panel's own axis titles. The previous template hard-coded
        # column names for x/y (wrong in most panels) and used %{z}, which a splom
        # trace does not provide.
        hovertemplate="<b>%{xaxis.title.text}</b>: %{x}"
        + "<br><b>%{yaxis.title.text}</b>: %{y}"
        + "<br><b>Successful</b>: %{customdata}",
    )
)
fig.update_layout(
    title="Scatterplot Matrix for Numeric Variables",
    title_x=0.5,
    width=800,
    height=800,
)
fig.show()


## Visualize correlation Matrix


In [22]:
# Pairwise correlations of the three key numeric columns, rounded for display.
heatmap_columns = ["backers_count", "usd_goal", "usd_pledged"]
corr = kickstarter_numeric_df[heatmap_columns].corr().round(2)

# The matrix is already rounded to two decimals, so it doubles as the cell annotations.
fig = ff.create_annotated_heatmap(
    z=corr.values,
    x=corr.columns.tolist(),
    y=corr.index.tolist(),
    annotation_text=corr.values,
    showscale=True,
)

heatmap_layout = dict(
    title="Correlation Matrix for Numeric Variables",
    title_font_size=20,
    title_x=0.5,
    width=800,
    height=800,
    font=dict(size=14),
)
fig.update_layout(**heatmap_layout)
fig.show()


## Visualize scatter plot for the relationship between usd_goal and usd_pledged


In [23]:
# One marker per project, coloured by outcome.
outcome_colorscale = [[0, "rgb(251, 100, 100)"], [1, "rgb(100, 100, 251)"]]
projects_trace = go.Scatter(
    x=kickstarter_df["usd_goal"],
    y=kickstarter_df["usd_pledged"],
    mode="markers",
    marker=dict(
        color=kickstarter_df["SuccessfulBool"],
        colorscale=outcome_colorscale,
    ),
    customdata=kickstarter_df["SuccessfulBool"],
    hovertemplate="<b>usd_goal</b>: %{x}<br><b>usd_pledged</b>: %{y}<br><b>Successful</b>: %{customdata}",
)

# Diagonal y = x reference line: points above it pledged more than their goal.
largest_goal = kickstarter_df["usd_goal"].max()
diagonal_trace = go.Scatter(
    x=[0, largest_goal],
    y=[0, largest_goal],
    mode="lines",
    marker=dict(color="rgb(0, 0, 0)"),
    hoverinfo="skip",
)

fig = go.Figure()
for trace in (projects_trace, diagonal_trace):
    fig.add_trace(trace)

scatter_layout = dict(
    title="Scatterplot for the Ratio between usd_pledged and usd_goal",
    title_x=0.5,
    width=800,
    height=800,
)
fig.update_layout(**scatter_layout)
fig.show()


# Visualize pie chart for categorical variables


## Visualize pie chart for country


In [24]:
import plotly.graph_objects as go

# Display names for the two-letter country codes.
country_names = {
    "US": "United States",
    "GB": "United Kingdom",
    "CA": "Canada",
    "AU": "Australia",
    "DE": "Germany",
    "FR": "France",
    "NL": "Netherlands",
    "IT": "Italy",
    "ES": "Spain",
    "SE": "Sweden",
    "MX": "Mexico",
    "NZ": "New Zealand",
    "DK": "Denmark",
    "IE": "Ireland",
    "CH": "Switzerland",
    "NO": "Norway",
    "BE": "Belgium",
    "AT": "Austria",
    "HK": "Hong Kong",
    "SG": "Singapore",
    "LU": "Luxembourg",
    "JP": "Japan",
}
# Countries below this share of all projects are merged into one "other" slice.
min_country_share = 0.03

# Count and share come from the same value_counts() result, so they cannot drift apart.
country_counts = kickstarter_df["country"].value_counts()
country_df = pd.DataFrame(
    {
        # Fall back to the raw code when a country is missing from the lookup,
        # instead of producing a NaN label.
        "label": [country_names.get(code, code) for code in country_counts.index],
        "count": country_counts.to_numpy(),
        "norm": (country_counts / country_counts.sum()).to_numpy(),
    }
)

# Relabel the small slices, then aggregate only the numeric columns per label.
# sort=False keeps the descending-frequency order, with "other" last.
country_df["label"] = country_df["label"].where(
    country_df["norm"] >= min_country_share, "other"
)
country_df = country_df.groupby("label", as_index=False, sort=False)[
    ["count", "norm"]
].sum()

# Pie chart of the country distribution; slices under 3% are grouped as "other".
fig = go.Figure()
fig.add_trace(
    go.Pie(
        labels=country_df["label"],
        values=country_df["count"],
        name="country",
        text=country_df["label"],
    )
)
fig.update_layout(
    width=800,
    height=800,
    title_text="Distribution of the country",
    annotations=[
        dict(
            showarrow=False,
            x=0.5,
            y=0.5,
            text="",
            font_size=20,
            font_family="Arial",
        )
    ],
)

fig.show()


## Visualize pie chart for the currency


In [25]:
import plotly.graph_objects as go

# Display names for the three-letter currency codes.
currency_names = {
    "USD": "US Dollar",
    "GBP": "British Pound Sterling",
    "CAD": "Canadian Dollar",
    "AUD": "Australian Dollar",
    "EUR": "Euro",
    "SEK": "Swedish Krona",
    "NZD": "New Zealand Dollar",
    "DKK": "Danish Krone",
    "NOK": "Norwegian Krone",
    "CHF": "Swiss Franc",
    "MXN": "Mexican Peso",
    "HKD": "Hong Kong Dollar",
    "SGD": "Singapore Dollar",
    "JPY": "Japanese Yen",
}
# Currencies below this share of all projects are merged into one "other" slice.
min_currency_share = 0.03

# Count and share come from the same value_counts() result, so they cannot drift apart.
currency_counts = kickstarter_df["currency"].value_counts()
currency_df = pd.DataFrame(
    {
        # Fall back to the raw code when a currency is missing from the lookup,
        # instead of producing a NaN label.
        "label": [currency_names.get(code, code) for code in currency_counts.index],
        "count": currency_counts.to_numpy(),
        "norm": (currency_counts / currency_counts.sum()).to_numpy(),
    }
)

# Relabel the small slices, then aggregate only the numeric columns per label.
# sort=False keeps the descending-frequency order, with "other" last.
currency_df["label"] = currency_df["label"].where(
    currency_df["norm"] >= min_currency_share, "other"
)
currency_df = currency_df.groupby("label", as_index=False, sort=False)[
    ["count", "norm"]
].sum()

# Pie chart of the currency distribution; slices under 3% are grouped as "other".
fig = go.Figure()
fig.add_trace(
    go.Pie(
        labels=currency_df["label"],
        values=currency_df["count"],
        name="currency",
        text=currency_df["label"],
    )
)
fig.update_layout(
    width=800,
    height=800,
    title_text="Distribution of the currency",
    annotations=[
        dict(
            showarrow=False,
            x=0.5,
            y=0.5,
            text="",
            font_size=20,
            font_family="Arial",
        )
    ],
)
fig.show()

## Visualize pie chart for the category


In [26]:
import plotly.graph_objects as go

# Categories below this share of all projects are merged into one "other" slice.
min_category_share = 0.03

# Count and share come from the same value_counts() result, so they cannot drift apart.
category_counts = kickstarter_df["category"].value_counts()
category_df = pd.DataFrame(
    {
        "label": category_counts.index.tolist(),
        "count": category_counts.to_numpy(),
        "norm": (category_counts / category_counts.sum()).to_numpy(),
    }
)

# Relabel the small slices, then aggregate only the numeric columns per label
# (summing the whole frame would also concatenate the category name strings).
# sort=False keeps the descending-frequency order, with "other" last.
category_df["label"] = category_df["label"].where(
    category_df["norm"] >= min_category_share, "other"
)
category_df = category_df.groupby("label", as_index=False, sort=False)[
    ["count", "norm"]
].sum()

# Pie chart of the category distribution; slices under 3% are grouped as "other".
fig = go.Figure()
fig.add_trace(
    go.Pie(
        labels=category_df["label"],
        values=category_df["count"],
        name="category",
        text=category_df["label"],
    )
)
fig.update_layout(
    width=800,
    height=800,
    title_text="Distribution of the category",
    annotations=[
        dict(
            showarrow=False,
            x=0.5,
            y=0.5,
            text="",
            font_size=20,
            font_family="Arial",
        )
    ],
)
fig.show()

## Drop columns that are not relevant for our analysis


In [27]:
# Columns carried forward into the analysis, in output order.
relevant_for_analysis_columns = [
    "backers_count",
    "blurb",
    "category",
    "pledged",
    "usd_pledged",
    "usd_goal",
    "currency",
    "name",
    "SuccessfulBool",
]

# Label-based selection of all rows for the chosen columns.
relevant_kickstarter_df = kickstarter_df.loc[:, relevant_for_analysis_columns]

# Check again for null values


In [28]:
# Number of rows in the reduced frame that still contain a missing value.
relevant_row_has_missing = relevant_kickstarter_df.isna().any(axis="columns")
relevant_row_has_missing.sum()


0

In [29]:
# Summary statistics of the numeric columns, stored with the nullable Float64 dtype.
summary_statistics = relevant_kickstarter_df.describe()
describe_df = summary_statistics.astype("Float64")

In [31]:
# Display the summary-statistics table.
describe_df

Unnamed: 0,backers_count,pledged,usd_pledged,usd_goal,SuccessfulBool
count,20632.0,20632.0,20632.0,20632.0,20632.0
mean,183.675843,21392.675739,20915.907911,87721.243602,0.291683
std,1222.012658,120497.251802,115471.73309,1284944.863582,0.454548
min,0.0,0.0,0.0,0.702277,0.0
25%,2.0,25.0,25.0,4000.0,0.0
50%,12.0,695.0,716.301193,13749.96797,0.0
75%,63.0,5954.25,6004.628177,45247.9017,1.0
max,105857.0,6225354.98,6225354.98,100000000.0,1.0


In [38]:
# relevant_kickstarter_df.to_csv(
#     "./data/Part3.Team8.kickstarter_data_full_cleaned.csv", index=False
# )


In [37]:
# NOTE: keep the index when exporting — it holds the statistic names (count, mean, std, ...).
# describe_df.to_csv(
#     "./data/Part4.Team8.kickstarter_data_full_cleaned_describe.csv", index=True
# )