## Given the name of an existing Kickstarter project, the 5 most similar projects are recommended to the user.

In [1]:
import pandas as pd
import json
from rake_nltk import Rake
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# Show full cell contents (project names and blurbs) without truncation.
# None means "no limit"; the former -1 sentinel is deprecated since pandas 1.0
# and is rejected by newer pandas releases.
pd.set_option('display.max_colwidth', None)

#### Read in a CSV file from an external GitHub URL, containing a large list of Kickstarter projects

In [3]:
# Location of the raw CSV (hosted on GitHub) holding the list of kickstarter projects.
KICKSTARTER_CSV_URL = 'https://raw.githubusercontent.com/L-Lewis/Kickstarter-success-machine-learning/master/data/Kickstarter000.csv'

# Download and parse the CSV into the DataFrame used by every later cell.
dfnew = pd.read_csv(KICKSTARTER_CSV_URL)

#### Combine multiple features which provide valuable information on a project into one string per project. Use the RAKE keyword-extraction library (rake_nltk) to extract ranked key phrases from each project's blurb (its short summary text)

In [5]:
def combine_features(row):
    """Build the single text string used to compare one project with the others.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row)
        Must provide ``row['category']``, a JSON string containing at least the
        keys ``'name'`` and ``'slug'``, and ``row['blurb']``, the project's
        summary text.

    Returns
    -------
    str
        ``"<category name> <category slug> <key phrases...>"`` where the key
        phrases are the ranked phrases Rake extracts from the blurb, joined by
        single spaces.

    Notes
    -----
    A blurb that is not a string (for example the float NaN pandas uses for an
    empty CSV field) contributes no key phrases instead of being handed to
    Rake, which expects text.
    """
    category = json.loads(row['category'])

    blurb = row['blurb']
    if isinstance(blurb, str):
        r = Rake()
        r.extract_keywords_from_text(blurb)
        arr = r.get_ranked_phrases()
    else:
        # Missing / non-text blurb: fall back to the category information only.
        arr = []

    return category['name']+" "+category['slug']+" "+ " ".join(arr)

# Apply combine_features to every row (row-wise apply) and store the resulting
# text in a new column that the vectorizer consumes later on.
dfnew["combinedfeatures"] = dfnew.apply(combine_features, axis="columns")

#### Create functions to get a kickstarter project's name given its index, and vice versa

In [30]:
def get_title_from_index(index):
    """Look up the project name stored under a given index label of ``dfnew``.

    Returns a pandas Series (not a bare string) holding the ``'name'`` value of
    every row whose index equals ``index``, with each value converted via
    ``str``. The Series is empty when no row carries that index label.
    """
    row_mask = dfnew.index == index
    titles = dfnew.loc[row_mask, 'name']
    return titles.apply(str)

def get_index_from_title(title):
    """Return the index label of the first ``dfnew`` row whose name equals ``title``.

    Parameters
    ----------
    title : str
        Exact project name to look for in the ``'name'`` column.

    Returns
    -------
    The index label of the first matching row (when several rows share the
    name, the earliest one in ``dfnew`` wins).

    Raises
    ------
    IndexError
        If no project carries that name. The exception type is the one the
        plain ``.index[0]`` lookup raised before; only the message now states
        which title was not found.
    """
    matches = dfnew[dfnew['name'] == title].index
    if len(matches) == 0:
        raise IndexError("No kickstarter project named {!r} was found".format(title))
    return matches[0]

#### Use the CountVectorizer from scikit-learn to convert each project's combined feature string into a vector of word counts, then compute the cosine similarity between every pair of projects. Afterwards, use the cosine similarity matrix to determine which records are most similar to the chosen record - values closest to one indicate that the two projects' combined features share the most words, while values near zero indicate little or no overlap.

In [33]:
# Turn every project's combined feature string into a vector of word counts,
# then compare all projects pairwise. cosine_sim[i][j] is the cosine
# similarity between the count vectors of rows i and j.
cv = CountVectorizer()
count_matrix = cv.fit_transform(dfnew["combinedfeatures"])
cosine_sim = cosine_similarity(count_matrix)

userlikes = "New Final Round Album"
num_recommendations = 5
index = get_index_from_title(userlikes)

# Exclude the chosen project explicitly rather than assuming it sorts first:
# rows with identical combined features also score 1.0, and because the sort
# is stable one of them can land ahead of the chosen project, which would
# otherwise end up in its own recommendations.
# NOTE(review): cosine_sim is addressed by row position while `index` is a
# dfnew index label; the two coincide only for the default 0..n-1 index -
# confirm dfnew is never re-indexed or filtered before this cell.
similar_projects = [
    (position, score)
    for position, score in enumerate(cosine_sim[index])
    if position != index
]
sortedprojects = sorted(similar_projects, key=lambda x: x[1], reverse=True)

dftest = pd.DataFrame()

# Slicing (instead of indexing positions 1..5) also copes with data sets that
# hold fewer projects than the number of recommendations requested.
for project_index, score in sortedprojects[:num_recommendations]:
    print(round(score,2))
    print(get_title_from_index(project_index))
    print('\n')
    

0.51
1051    Moon Debris LP
Name: name, dtype: object


0.49
515    Click here to help KHP finish their first Full Length album
Name: name, dtype: object


0.48
331    Railway Glass First Studio Album
Name: name, dtype: object


0.45
127    The Sons of Kirk: SciFi-Themed Rock Album and Music Video!
Name: name, dtype: object


0.45
344    L'Amourita
Name: name, dtype: object


