In [1]:

import pandas as pd
import numpy as np
from scipy.stats import pearsonr
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import seaborn as sns

# Load the datasets
users = pd.read_csv("users.csv")
repos = pd.read_csv("repositories.csv")
users.head(), repos.head()


(     login                name          company            location  \
 0  midudev  Miguel Ángel Durán              NaN           Barcelona   
 1       ai       Andrey Sitnik     EVILMARTIANS    Barcelona, Spain   
 2  raysan5                 Ray       RAYLIBTECH           Barcelona   
 3  vfarcic       Viktor Farcic          UPBOUND    Barcelona, Spain   
 4    spite       Jaume Sanchez  GOOGLE-DEEPMIND  London · Barcelona   
 
                       email  hireable  \
 0          miduga@gmail.com     False   
 1          andrey@sitnik.ru     False   
 2         raysan5@gmail.com      True   
 3         viktor@farcic.com     False   
 4  hello@clicktorelease.com     False   
 
                                                  bio  public_repos  followers  \
 0  Te enseño Programación y Desarrollo Web. Cread...           194      28286   
 1  The creator of Autoprefixer, @postcss, @browse...            85       9155   
 2  I make tools and technology for videogames dev...            2

In [2]:

# Q1: Who are the top 5 users in Barcelona with the highest number of followers?
top_5_users = users.nlargest(5, 'followers')[['login']]
top_5_users_list = ",".join(top_5_users['login'].tolist())
top_5_users_list


'midudev,ai,raysan5,vfarcic,spite'

In [3]:

# Q2: Who are the 5 earliest registered GitHub users in Barcelona?
# Convert 'created_at' column to datetime
users['created_at'] = pd.to_datetime(users['created_at'], errors='coerce')

# Q2: Who are the 5 earliest registered GitHub users in Barcelona?
earliest_users = users.nsmallest(5, 'created_at')[['login']]
earliest_users_list = ",".join(earliest_users['login'].tolist())
earliest_users_list


'oleganza,gravityblast,fesplugas,fxn,pauek'

In [4]:

# Q3: What are the 3 most popular licenses among these users?
top_licenses = repos['license_name'].value_counts().head(3).index.tolist()
top_licenses_list = ",".join(top_licenses)
top_licenses_list


'mit,apache-2.0,other'

In [5]:

# Q4: Which company do the majority of these developers work at?
majority_company = users['company'].mode().iloc[0]
majority_company


'FREELANCE'

In [6]:

# Q5: Which programming language is most popular among these users?
most_popular_language = repos['language'].mode().iloc[0]
most_popular_language


'JavaScript'

In [7]:

# Q6: Which programming language is the second most popular among users who joined after 2020?
language_after_2020 = repos[repos['created_at'] > '2020-01-01']['language'].value_counts().index[1]
language_after_2020


'TypeScript'

In [8]:

# Q7: Which language has the highest average number of stars per repository?
highest_avg_stars_language = repos.groupby('language')['stargazers_count'].mean().idxmax()
highest_avg_stars_language


'Vim Script'

In [9]:

# Q8: Define leader_strength and find the top 5 users
users['leader_strength'] = users['followers'] / (1 + users['following'])
top_leader_strength = users.nlargest(5, 'leader_strength')['login'].tolist()
",".join(top_leader_strength)


'midudev,vfarcic,spite,amix,cfenollosa'

In [10]:

# Q9: Correlation between the number of followers and the number of public repositories
correlation, _ = pearsonr(users['followers'], users['public_repos'])
round(correlation, 3)


0.071

In [11]:

# Q10: Linear regression slope of followers on repos
X = users[['public_repos']]
y = users['followers']
reg = LinearRegression().fit(X, y)
slope = round(reg.coef_[0], 3)
slope


1.031

In [12]:
if repos['has_projects'].dtype == 'object':
    repos['has_projects'] = repos['has_projects'].map({'true': True, 'false': False})
if repos['has_wiki'].dtype == 'object':
    repos['has_wiki'] = repos['has_wiki'].map({'true': True, 'false': False})
    
correlation = repos['has_projects'].corr(repos['has_wiki'])
    
print(round(correlation, 3))

0.317


In [13]:

# Q12: Average following difference between hireable and non-hireable users
hireable_avg_following = users[users['hireable'] == True]['following'].mean()
non_hireable_avg_following = users[users['hireable'] == False]['following'].mean()
difference = hireable_avg_following - non_hireable_avg_following
difference


294.6677803379416

In [14]:

# Q13: Correlation of bio length with followers
from sklearn.linear_model import LinearRegression
users_with_bio = users[(users['bio'].notna()) & (users['bio'] != '')].copy()
users_with_bio.loc[:, 'bio_len'] = users_with_bio['bio'].str.len()

X = users_with_bio['bio_len'].values.reshape(-1,1)
y = users_with_bio['followers']

lr2 = LinearRegression()
lr2.fit(X, y)
round(lr2.coef_[0], 3)


1.716

In [15]:

# Q14: Most repositories created on weekends
import csv
from collections import Counter
from datetime import datetime

weekend_repo_counts = Counter()

with open('repositories.csv', 'r', encoding='utf-8') as file:
    reader = csv.DictReader(file)
    
    for row in reader:
        created_at = row.get('created_at', '')
        if created_at:
            created_date = datetime.fromisoformat(created_at[:-1])  
            
            if created_date.weekday() in [5, 6]:
                user_login = row['login']
                weekend_repo_counts[user_login] += 1  

top_users = weekend_repo_counts.most_common(5)

top_logins = [user[0] for user in top_users]

print(','.join(top_logins))


nilportugues,kinow,ajsb85,vfarcic,wlsf82


In [16]:

# Q15: Difference in email sharing between hireable and non-hireable users
fraction_hierable = users[users['hireable'] == True]['email'].notna().mean()
fraction_non_hierable = users[users['hireable'] == False]['email'].notna().mean()
diff = fraction_hierable - fraction_non_hierable
diff


0.09750384024577574

In [17]:

# Q16: Most common surname among users
new_users = users[users['name'].notna()].copy()
new_users['surname'] = new_users['name'].str.split().str[-1].str.strip()
surname_counts = new_users['surname'].value_counts()
max_count = surname_counts.max()
common_surnames = surname_counts[surname_counts == max_count].index.tolist()
common_surnames.sort()
print(','.join(common_surnames))


Martínez,Ortiz
