In [1]:
import pandas as pd

# Load the CSV data into a pandas DataFrame
data = pd.read_csv('repositories.csv')

# Display the first few rows of the DataFrame
data.head()

Unnamed: 0,Name,Description,URL,Created At,Updated At,Homepage,Size,Stars,Forks,Issues,...,Has Issues,Has Projects,Has Downloads,Has Wiki,Has Pages,Has Discussions,Is Fork,Is Archived,Is Template,Default Branch
0,freeCodeCamp,freeCodeCamp.org's open-source codebase and cu...,https://github.com/freeCodeCamp/freeCodeCamp,2014-12-24T17:49:19Z,2023-09-21T11:32:33Z,http://contribute.freecodecamp.org/,387451,374074,33599,248,...,True,True,True,False,True,False,False,False,False,main
1,free-programming-books,:books: Freely available programming books,https://github.com/EbookFoundation/free-progra...,2013-10-11T06:50:37Z,2023-09-21T11:09:25Z,https://ebookfoundation.github.io/free-program...,17087,298393,57194,46,...,True,False,True,False,True,False,False,False,False,main
2,awesome,😎 Awesome lists about all kinds of interesting...,https://github.com/sindresorhus/awesome,2014-07-11T13:42:37Z,2023-09-21T11:18:22Z,,1441,269997,26485,61,...,True,False,True,False,True,False,False,False,False,main
3,996.ICU,Repo for counting stars and contributing. Pres...,https://github.com/996icu/996.ICU,2019-03-26T07:31:14Z,2023-09-21T08:09:01Z,https://996.icu,187799,267901,21497,16712,...,False,False,True,False,False,False,False,True,False,master
4,coding-interview-university,A complete computer science study plan to beco...,https://github.com/jwasham/coding-interview-un...,2016-06-06T02:34:12Z,2023-09-21T10:54:48Z,,20998,265161,69434,56,...,True,False,True,False,False,False,False,False,False,main


In [2]:
data.columns

Index(['Name', 'Description', 'URL', 'Created At', 'Updated At', 'Homepage',
       'Size', 'Stars', 'Forks', 'Issues', 'Watchers', 'Language', 'License',
       'Topics', 'Has Issues', 'Has Projects', 'Has Downloads', 'Has Wiki',
       'Has Pages', 'Has Discussions', 'Is Fork', 'Is Archived', 'Is Template',
       'Default Branch'],
      dtype='object')

In [3]:
columns_to_keep = ['Name', 'Description', 'Stars', 'Language', 'Topics']
if all(col in data.columns for col in columns_to_keep):
    data = data[columns_to_keep]
else:
    print("One or more columns do not exist in the DataFrame.")


In [4]:
data.head()

Unnamed: 0,Name,Description,Stars,Language,Topics
0,freeCodeCamp,freeCodeCamp.org's open-source codebase and cu...,374074,TypeScript,"['careers', 'certification', 'community', 'cur..."
1,free-programming-books,:books: Freely available programming books,298393,,"['books', 'education', 'hacktoberfest', 'list'..."
2,awesome,😎 Awesome lists about all kinds of interesting...,269997,,"['awesome', 'awesome-list', 'lists', 'resource..."
3,996.ICU,Repo for counting stars and contributing. Pres...,267901,,[]
4,coding-interview-university,A complete computer science study plan to beco...,265161,,"['algorithm', 'algorithms', 'coding-interview'..."


In [5]:
data.dropna(inplace=True)
data.head()

Unnamed: 0,Name,Description,Stars,Language,Topics
0,freeCodeCamp,freeCodeCamp.org's open-source codebase and cu...,374074,TypeScript,"['careers', 'certification', 'community', 'cur..."
5,public-apis,A collective list of free APIs,256615,Python,"['api', 'apis', 'dataset', 'development', 'fre..."
6,developer-roadmap,"Interactive roadmaps, guides and other educati...",251416,TypeScript,"['angular-roadmap', 'backend-roadmap', 'blockc..."
7,system-design-primer,Learn how to design large-scale systems. Prep ...,229569,Python,"['design', 'design-patterns', 'design-system',..."
9,react,The library for web and native user interfaces,213299,JavaScript,"['declarative', 'frontend', 'javascript', 'lib..."


In [6]:
data.iloc[23451]


Name                      webpack-visualizer
Description    Visualize your Webpack bundle
Stars                                   1685
Language                          JavaScript
Topics         ['webpack', 'webpack-plugin']
Name: 25927, dtype: object

In [7]:
data.head()

Unnamed: 0,Name,Description,Stars,Language,Topics
0,freeCodeCamp,freeCodeCamp.org's open-source codebase and cu...,374074,TypeScript,"['careers', 'certification', 'community', 'cur..."
5,public-apis,A collective list of free APIs,256615,Python,"['api', 'apis', 'dataset', 'development', 'fre..."
6,developer-roadmap,"Interactive roadmaps, guides and other educati...",251416,TypeScript,"['angular-roadmap', 'backend-roadmap', 'blockc..."
7,system-design-primer,Learn how to design large-scale systems. Prep ...,229569,Python,"['design', 'design-patterns', 'design-system',..."
9,react,The library for web and native user interfaces,213299,JavaScript,"['declarative', 'frontend', 'javascript', 'lib..."


In [9]:
#time for some feature engineering
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder

In [10]:
# NLP processing for 'Description'
tfidf = TfidfVectorizer(max_features=100)  # Adjust parameters as needed
description_features = tfidf.fit_transform(data['Description'])

In [13]:
# One-hot encoding for 'Language'
encoder = OneHotEncoder()
language_features = encoder.fit_transform(data[['Language']])

In [15]:
from sklearn.preprocessing import MinMaxScaler

# Normalizing 'Stars'
scaler = MinMaxScaler()
stars_features = scaler.fit_transform(data[['Stars']])

In [20]:
# Assuming 'Topics' is a list of topics in string format, like "['topic1', 'topic2', ...]"
import ast  # For safely evaluating the string representation of a list

# Convert the string representation of lists into actual lists
data['Topics'] = data['Topics'].apply(ast.literal_eval)

# Join the topics into a single string per entry
data['Topics_joined'] = data['Topics'].apply(lambda x: ' '.join(x))

# NLP processing for 'Topics'
tfidf_topics = TfidfVectorizer(max_features=100)  # You can adjust the parameters
topics_features = tfidf_topics.fit_transform(data['Topics_joined'])

In [22]:
import scipy.sparse as sp

# Combine all features into one dataset
all_features = sp.hstack([description_features, language_features, stars_features, topics_features])

# Convert to a format suitable for training (e.g., to a dense array if the dataset is not too large)
all_features_dense = all_features.toarray()


#### Prepare labels?

Only needed for supervised models!

In [34]:
from sklearn.metrics.pairwise import cosine_similarity

user_input = ""
user_input_processed = tfidf.transform([user_input])

N = 10

# Calculate similarity
similarity_scores = cosine_similarity(user_input_processed, description_features)  # Assuming 'description_features' is your repository feature matrix

# Get top N similar repositories
top_n_indices = similarity_scores.argsort()[0][-N:]  # Replace N with the number of recommendations you want
top_n_repos = data.iloc[top_n_indices]
top_n_repos

Unnamed: 0,Name,Description,Stars,Language,Topics,Topics_joined
80261,SQuAD-explorer,Visually Explore the Stanford Question Answeri...,521,JavaScript,"[dataset, leaderboard, visual-analysis]",dataset leaderboard visual-analysis
160563,bl_iot_sdk,"BL602/BL702 SDK. Any technical topic, please a...",238,C,"[aiot, bl602, bl60x, bl702, bl706, bl70x, ble,...",aiot bl602 bl60x bl702 bl706 bl70x ble bouffal...
36837,Tomb,the Crypto Undertaker,1195,Shell,"[crypto, dyne, encryption, files, filesystem, ...",crypto dyne encryption files filesystem keys l...
175145,csm,The C(canonical) Scan Matcher,214,C,[],
11971,genshin-wish-export,Easily export the Genshin Impact wish record.,3438,JavaScript,"[genshin, genshin-impact]",genshin genshin-impact
62169,mongo-spark,The MongoDB Spark Connector,685,Java,"[connector, mongo-spark, mongodb, spark, spark...",connector mongo-spark mongodb spark spark-pack...
154916,CSSans.Pro,"CSSans Pro - The Colourful, Sassy, CSS Font",248,JavaScript,"[css, cssans, font, pro]",css cssans font pro
85087,btpd,⚡ The BitTorrent Protocol Daemon,488,C,"[bittorrent, c, daemon, torrent]",bittorrent c daemon torrent
62105,prjxray,Documenting the Xilinx 7-series bit-stream for...,685,Python,"[artix, artix7, bitstream, fpga, fuzzer, kinte...",artix artix7 bitstream fpga fuzzer kintex7 sym...
156932,sakila,The Sakila Database,245,PLpgSQL,[],
