In [24]:

import pandas as pd
import numpy as np
from scipy.stats import pearsonr
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import seaborn as sns

# Load the datasets
users = pd.read_csv("users.csv")
repos = pd.read_csv("repositories.csv")
users.head(), repos.head()


(     login                name          company            location  \
 0  midudev  Miguel Ángel Durán              NaN           Barcelona   
 1       ai       Andrey Sitnik     EVILMARTIANS    Barcelona, Spain   
 2  raysan5                 Ray       RAYLIBTECH           Barcelona   
 3  vfarcic       Viktor Farcic          UPBOUND    Barcelona, Spain   
 4    spite       Jaume Sanchez  GOOGLE-DEEPMIND  London · Barcelona   
 
                       email  hireable  \
 0          miduga@gmail.com     False   
 1          andrey@sitnik.ru     False   
 2         raysan5@gmail.com      True   
 3         viktor@farcic.com     False   
 4  hello@clicktorelease.com     False   
 
                                                  bio  public_repos  followers  \
 0  Te enseño Programación y Desarrollo Web. Cread...           194      28286   
 1  The creator of Autoprefixer, @postcss, @browse...            85       9155   
 2  I make tools and technology for videogames dev...            2

In [25]:

# Q1: Who are the top 5 users in Barcelona with the highest number of followers?
top_5_users = users.nlargest(5, 'followers')[['login']]
top_5_users_list = ",".join(top_5_users['login'].tolist())
top_5_users_list


'midudev,ai,raysan5,vfarcic,spite'

In [26]:

# Q2: Who are the 5 earliest registered GitHub users in Barcelona?
# Convert 'created_at' column to datetime
users['created_at'] = pd.to_datetime(users['created_at'], errors='coerce')

# Q2: Who are the 5 earliest registered GitHub users in Barcelona?
earliest_users = users.nsmallest(5, 'created_at')[['login']]
earliest_users_list = ",".join(earliest_users['login'].tolist())
earliest_users_list


'oleganza,gravityblast,fesplugas,fxn,pauek'

In [27]:

# Q3: What are the 3 most popular licenses among these users?
top_licenses = repos['license_name'].value_counts().head(3).index.tolist()
top_licenses_list = ",".join(top_licenses)
top_licenses_list


'mit,apache-2.0,other'

In [28]:

# Q4: Which company do the majority of these developers work at?
majority_company = users['company'].mode().iloc[0]
majority_company


'FREELANCE'

In [29]:

# Q5: Which programming language is most popular among these users?
most_popular_language = repos['language'].mode().iloc[0]
most_popular_language


'JavaScript'

In [30]:

# Q6: Which programming language is the second most popular among users who joined after 2020?
users_post_2020 = users.query('created_at > "2020-01-01 00:00:00+00:00"')
repos_2020 = repos[repos['login'].isin(users_post_2020['login'].tolist())]
repos_2020['language'].value_counts().index[1]


'Python'

In [31]:

# Q7: Which language has the highest average number of stars per repository?
highest_avg_stars_language = repos.groupby('language')['stargazers_count'].mean().idxmax()
highest_avg_stars_language


'Vim Script'

In [32]:

# Q8: Define leader_strength and find the top 5 users
users['leader_strength'] = users['followers'] / (1 + users['following'])
top_leader_strength = users.nlargest(5, 'leader_strength')['login'].tolist()
",".join(top_leader_strength)


'midudev,vfarcic,spite,amix,cfenollosa'

In [33]:

# Q9: Correlation between the number of followers and the number of public repositories
correlation, _ = pearsonr(users['followers'], users['public_repos'])
round(correlation, 3)


0.071

In [34]:

# Q10: Linear regression slope of followers on repos
X = users[['public_repos']]
y = users['followers']
reg = LinearRegression().fit(X, y)
slope = round(reg.coef_[0], 3)
slope


1.031

In [35]:
import statsmodels.api as sm

# Prepare the data
X = users['public_repos']  # Independent variable: number of public repositories
y = users['followers']     # Dependent variable: number of followers

# Add a constant to the independent variable (for the intercept)
X = sm.add_constant(X)

# Perform the regression
model = sm.OLS(y, X).fit()

# Print the summary of the regression
print(model.summary())

# Get the coefficient for 'public_repos'
repo_coefficient = model.params['public_repos']
print(f"Estimated additional followers per additional public repository: {repo_coefficient:.3f}")

                            OLS Regression Results                            
Dep. Variable:              followers   R-squared:                       0.005
Model:                            OLS   Adj. R-squared:                  0.002
Method:                 Least Squares   F-statistic:                     1.709
Date:                Wed, 30 Oct 2024   Prob (F-statistic):              0.192
Time:                        21:17:38   Log-Likelihood:                -2972.9
No. Observations:                 337   AIC:                             5950.
Df Residuals:                     335   BIC:                             5957.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
const          340.2778    111.583      3.050   

In [36]:
has_projects = repos['has_projects'].astype(bool)
has_wiki = repos['has_wiki'].astype(bool)

f"{has_projects.corr(has_wiki):.3f}"

'0.317'

In [37]:

# Q12: Average following difference between hireable and non-hireable users
hireable_following = users[users['hireable'] == True]['following'].mean()
non_hireable_following = users[users['hireable'] != True]['following'].mean()
f"{hireable_following - non_hireable_following:.3f}"


'294.668'

In [38]:

# Q13: Correlation of bio length with followers
users_with_bio = users.loc[users['bio'].notna(), :].copy().reset_index(drop=True, inplace=False)
users_with_bio['bio_length'] = users_with_bio['bio'].str.strip().str.split().str.len()
f"{linregress(users_with_bio['bio_length'], users_with_bio['followers']).slope:.3f}"


NameError: name 'linregress' is not defined

In [39]:

# Q14: Most repositories created on weekends
import csv
from collections import Counter
from datetime import datetime

weekend_repo_counts = Counter()

with open('repositories.csv', 'r', encoding='utf-8') as file:
    reader = csv.DictReader(file)
    
    for row in reader:
        created_at = row.get('created_at', '')
        if created_at:
            created_date = datetime.fromisoformat(created_at[:-1])  
            
            if created_date.weekday() in [5, 6]:
                user_login = row['login']
                weekend_repo_counts[user_login] += 1  

top_users = weekend_repo_counts.most_common(5)

top_logins = [user[0] for user in top_users]

print(','.join(top_logins))


nilportugues,kinow,ajsb85,vfarcic,wlsf82


In [40]:

# Q15: Difference in email sharing between hireable and non-hireable users
hireable_true = users[users['hireable'] == True]
fraction_hireable_true = hireable_true['email'].notna().mean()

hireable_other = users[users['hireable'] != True]
fraction_hireable_other = hireable_other['email'].notna().mean()

f"{(fraction_hireable_true - fraction_hireable_other):.3f}"


'0.098'

In [41]:

# Q16: Most common surname among users
new_users = users[users['name'].notna()].copy()
new_users['surname'] = new_users['name'].str.split().str[-1].str.strip()
surname_counts = new_users['surname'].value_counts()
max_count = surname_counts.max()
common_surnames = surname_counts[surname_counts == max_count].index.tolist()
common_surnames.sort()
print(','.join(common_surnames))


Martínez,Ortiz
