In [1]:
import pandas as pd

Mount drive

In [2]:
# Mount Google Drive into the Colab filesystem at /content/drive so later cells
# can read files stored under it.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Read both project CSVs in one pass: the first path becomes users_df, the second repos_df.
users_df, repos_df = map(pd.read_csv, (
    '/content/drive/MyDrive/TDS-Proj1/users.csv',
    '/content/drive/MyDrive/TDS-Proj1/repositories.csv',
))

1. Who are the top 5 users in Bangalore with the highest number of followers? List their logins in order, comma-separated.

In [6]:
# Order all users by follower count, largest first, then keep the first five logins.
users_by_followers = users_df.sort_values(by='followers', ascending=False)
top_5_followers_login = list(users_by_followers['login'].iloc[:5])
top_5_followers_login_csv = ','.join(top_5_followers_login)
top_5_followers_login_csv

'krishnaik06,championswimmer,arpitbbhayani,manjunath5496,tanaypratap'

2. Who are the 5 earliest registered GitHub users in Bangalore? List their login in ascending order of created_at, comma-separated.

In [7]:
# Sort by created_at in ascending order (the default) and take the first five logins.
users_by_created_at = users_df.sort_values(by='created_at')
oldest_users = users_by_created_at['login'].iloc[:5]
oldest_users_csv = ','.join(oldest_users)
oldest_users_csv

'anandology,irfn,jace,abhisek,abhin4v'

3. What are the 3 most popular licenses among these users? Ignore missing licenses. List the license_name values in order, comma-separated.

In [8]:
# Count repositories per license (missing licenses removed first); value_counts
# returns the counts in descending order, so the first three labels are the answer.
license_counts = repos_df['license_name'].dropna().value_counts()
freq_licenses = list(license_counts.index[:3])
freq_licenses_csv = ','.join(freq_licenses)
freq_licenses_csv

'MIT License,Apache License 2.0,Other'

4. Which company do the majority of these developers work at?

In [9]:
# Tally users per company value and pick the label with the highest tally.
company_counts = users_df['company'].value_counts()
most_common_company = company_counts.idxmax()
most_common_company

'GOOGLE'

5. Which programming language is most popular among these users?

In [10]:
# Tally repositories per language and pick the label with the highest tally.
language_repo_counts = repos_df['language'].value_counts()
most_popular_language = language_repo_counts.idxmax()
most_popular_language


'JavaScript'

6. Which programming language is the second most popular among users who joined after 2020?

In [11]:
# users_df['created_at'] is compared as-is against the literal '2020-01-01'.
# NOTE(review): presumably created_at is still the raw ISO-8601 string from the CSV,
# in which case this cutoff also keeps users who joined during 2020 - confirm that
# this is the intended reading of "after 2020".
joined_after_cutoff = users_df['created_at'] > '2020-01-01'
post_2020_users = users_df.loc[joined_after_cutoff]

# Keep only repositories whose owner login belongs to one of those users.
owned_by_post_2020 = repos_df['login'].isin(post_2020_users['login'])
post_2020_repos = repos_df.loc[owned_by_post_2020]

# Last label of the two largest language counts, i.e. the runner-up language.
post_2020_language_counts = post_2020_repos['language'].value_counts()
second_most_popular_language = post_2020_language_counts.nlargest(2).index[-1]
second_most_popular_language

'HTML'

7. Which language has the highest average number of stars per repository?

In [12]:
# Mean stargazers_count per language; the answer is the language label with the
# largest mean (the variable holds that label, not the mean itself).
mean_stars_by_language = repos_df.groupby('language')['stargazers_count'].mean()
language_avg_stars = mean_stars_by_language.idxmax()
language_avg_stars

'Pascal'

8. Let's define leader_strength as followers / (1 + following). Who are the top 5 in terms of leader_strength? List their login in order, comma-separated.

In [13]:
# leader_strength = followers / (1 + following); stored as a new column on users_df.
users_df['leader_strength'] = users_df['followers'].div(users_df['following'].add(1))

# Rank by the new column, strongest first, and keep the first five logins.
users_by_leader_strength = users_df.sort_values(by='leader_strength', ascending=False)
top_5_leaders = list(users_by_leader_strength['login'].iloc[:5])
top_5_leaders_csv = ','.join(top_5_leaders)
top_5_leaders_csv

'krishnaik06,Hack-with-Github,laxmimerit,tanaypratap,abhishh1'

9. What is the correlation between the number of followers and the number of public repositories among users in Bangalore?

In [28]:
# Series.corr between follower counts and public repository counts,
# printed to three decimal places.
follower_counts = users_df['followers']
public_repo_counts = users_df['public_repos']
followers_and_repos_corr = follower_counts.corr(public_repo_counts)
print(f"{followers_and_repos_corr:.3f}")

0.191


10. Does creating more repos help users get more followers? Using regression, estimate how many additional followers a user gets per additional public repository.

In [27]:
from sklearn.linear_model import LinearRegression

# Fit followers as a linear function of public_repos. With a single feature column,
# coef_[0] is the estimated change in followers per additional public repository.
repo_feature = users_df[['public_repos']]
model = LinearRegression().fit(repo_feature, users_df['followers'])
followers_per_repo = model.coef_[0]
print(f"{followers_per_repo:.3f}")

2.340


11. Do people typically enable projects and wikis together? What is the correlation between a repo having projects enabled and having wiki enabled?

In [26]:
# Series.corr between the has_wiki and has_projects columns,
# printed to three decimal places.
wiki_flags = repos_df['has_wiki']
project_flags = repos_df['has_projects']
projects_and_wiki_corr = wiki_flags.corr(project_flags)
print(f"{projects_and_wiki_corr:.3f}")

0.198


12. Do hireable users follow more people than those who are not hireable?

In [25]:
# Split users into "hireable is True" and everything else (any value that is not
# equal to True, which includes missing values), then compare mean `following`.
hireable_mask = users_df['hireable'] == True
non_hireable_mask = users_df['hireable'] != True

avg_following_hireable = users_df.loc[hireable_mask, 'following'].mean()
avg_following_non_hireable = users_df.loc[non_hireable_mask, 'following'].mean()
print(avg_following_hireable)
print(avg_following_non_hireable)

following_diff = avg_following_hireable - avg_following_non_hireable
print(f"{following_diff:.3f}")

213.48648648648648
149.3909090909091
64.096


13. Some developers write long bios. Does that help them get more followers? What's the impact of the length of their bio (in Unicode words, split by whitespace) on followers? (Ignore people without bios)

In [30]:
import pandas as pd
from sklearn.linear_model import LinearRegression
import re

def count_unicode_words(paragraph):
    """Return the number of whitespace-separated words in ``paragraph``.

    ``str.split()`` with no arguments splits on runs of whitespace and never
    produces empty strings, so leading, trailing, and repeated whitespace do
    not inflate the count. An empty or whitespace-only string counts as 0.
    """
    return len(paragraph.split())

# Word count of every non-missing bio; rows without a bio get NaN in bio_length
# because the assignment aligns on the index.
bio_word_counts = users_df['bio'].dropna().apply(count_unicode_words)
users_df['bio_length'] = bio_word_counts

# Keep only the users that have a bio_length value.
filtered_df = users_df.dropna(subset=['bio_length'])

# Regress followers on bio_length (one feature column, one target).
X, y = filtered_df[['bio_length']], filtered_df['followers']
model = LinearRegression().fit(X, y)

# With a single feature, coef_[0] is the slope of followers against bio length.
regression_slope = model.coef_[0]

print(f"Regression slope of followers on bio length: {regression_slope:.3f}")

Regression slope of followers on bio length: 0.331


14. Who created the most repositories on weekends (UTC)? List the top 5 users' login in order, comma-separated

In [21]:
# Parse created_at in place so the .dt accessor is available.
# NOTE(review): the question asks for UTC weekends; this assumes the stored
# timestamps are already UTC - confirm.
repos_df['created_at'] = pd.to_datetime(repos_df['created_at'])

# dt.weekday numbers Monday as 0 and Sunday as 6, so values >= 5 are Saturday/Sunday.
repos_df['is_weekend'] = repos_df['created_at'].dt.weekday.ge(5)
weekend_repos = repos_df.loc[repos_df['is_weekend']]

# value_counts is sorted by count descending; the first five labels are the answer.
weekend_repo_counts = weekend_repos['login'].value_counts()
top_5_weekend_spenders = list(weekend_repo_counts.index[:5])
top_5_weekend_spenders_csv = ','.join(top_5_weekend_spenders)
top_5_weekend_spenders_csv

'itaditya,avinassh,sangam14,Tivotal,manjunath5496'

15. Do people who are hireable share their email addresses more often?

In [22]:
import pandas as pd

# Group 1: users whose hireable value equals True.
hireable_users = users_df.loc[users_df['hireable'] == True]
hireable_with_email = hireable_users['email'].notna().sum()
if len(hireable_users) > 0:
    hireable_fraction = hireable_with_email / len(hireable_users)
else:
    # Empty group: report 0 instead of dividing by zero.
    hireable_fraction = 0

# Group 2: everyone else (hireable is anything other than True, including missing).
non_hireable_users = users_df.loc[users_df['hireable'] != True]
non_hireable_with_email = non_hireable_users['email'].notna().sum()
if len(non_hireable_users) > 0:
    non_hireable_fraction = non_hireable_with_email / len(non_hireable_users)
else:
    non_hireable_fraction = 0

# Positive values mean hireable users share an email more often.
email_difference = hireable_fraction - non_hireable_fraction

print(f"Difference in email sharing: {email_difference:.3f}")

Difference in email sharing: 0.189


16. Let's assume that the last word in a user's name is their surname (ignore missing names, trim and split by whitespace.) What's the most common surname? (If there's a tie, list them all, comma-separated, alphabetically)

In [24]:
# Surname = last whitespace-separated token of each non-missing name.
# .str.split() with no pattern splits on whitespace (leading/trailing whitespace is
# ignored), and .str[-1] yields NaN for a name with no tokens instead of raising
# IndexError the way `x.strip().split()[-1]` does on a whitespace-only name.
users_df['surname'] = users_df['name'].dropna().astype(str).str.split().str[-1]

# value_counts ignores the NaN surnames by default.
surname_counts = users_df['surname'].value_counts()

# Keep every surname that reaches the highest count - one name when there is a clear
# winner, several when they tie - rather than always taking a fixed top 2.
# sorted() gives the alphabetical order the question asks for in the tie case.
top_surname_count = surname_counts.max()
most_common_surname = sorted(surname_counts[surname_counts == top_surname_count].index)
most_common_surname_csv = ','.join(most_common_surname)
most_common_surname_csv

'Kumar,Singh'