<a href="https://colab.research.google.com/github/adith-ds/project1/blob/main/code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

# Load the two input tables from /content (the path used by the Colab runtime).
# With read_csv's defaults, blank CSV fields are loaded as NaN.
udf = pd.read_csv('/content/users.csv')
rdf = pd.read_csv('/content/repositories.csv')

# A notebook cell only echoes its final bare expression (and a plain script
# echoes none), so print both column listings explicitly.
print(udf.columns)
print(rdf.columns)



In [None]:
# Rank users from most to fewest followers and keep the first five rows.
top_5_followers = udf.sort_values('followers', ascending=False).iloc[:5]

print(top_5_followers)

In [None]:
# Order users by their created_at value, smallest first, and keep five rows.
# No datetime conversion is applied in this cell; values are compared as loaded.
top_5_early = udf.sort_values('created_at').iloc[:5]

print(top_5_early)

In [None]:
# Tally repositories per license name. value_counts() orders by descending
# frequency and does not count NaN, so the first three labels are the three
# most used licenses. The mask only removes literal empty strings.
named_licenses = rdf.loc[rdf['license_name'] != '', 'license_name']
popular_licenses = named_licenses.value_counts().iloc[:3].index
# Emit the names as one comma-separated line
print(','.join(popular_licenses))

In [None]:
# Most frequent company among rows whose company is not the empty string.
# mode() can return several tied values; the first one is reported.
listed_companies = udf.loc[udf['company'] != '', 'company']
most_common_company = listed_companies.mode()[0]
print(most_common_company)


In [None]:
# Most frequent language among rows whose language is not the empty string.
# mode() can return several tied values; the first one is reported.
listed_languages = rdf.loc[rdf['language'] != '', 'language']
most_popular_language = listed_languages.mode()[0]
print(most_popular_language)

In [None]:
# Users whose created_at value compares >= '2020-01-01'
joined_recently = udf['created_at'] >= '2020-01-01'
recent_users = udf[joined_recently]

# Repositories whose owner login belongs to one of those users
owned_by_recent = rdf['login'].isin(recent_users['login'])
recent_repos = rdf[owned_by_recent]

# Rank languages by repository count (value_counts() skips NaN and sorts by
# descending frequency) and take the runner-up at position 1.
language_ranking = recent_repos.loc[recent_repos['language'] != '', 'language'].value_counts()
second_most_popular_language = language_ranking.index[1]
print(second_most_popular_language)

In [None]:
# Average stargazers_count per language, sorted highest first; the leading
# index label is the language with the greatest mean star count.
mean_stars_by_language = rdf.groupby('language')['stargazers_count'].mean()
language_stars = mean_stars_by_language.sort_values(ascending=False).index[0]
print(language_stars)

In [None]:
# leader_strength = followers / (1 + following); the +1 means a following
# count of 0 does not divide by zero. The column is stored on udf itself.
udf['leader_strength'] = udf['followers'].div(udf['following'] + 1)

# Logins of the five users with the highest leader_strength
ranked_by_strength = udf.sort_values('leader_strength', ascending=False)
top_5_leader_strength = ranked_by_strength['login'].head(5)
# Emit the logins as one comma-separated line
print(','.join(top_5_leader_strength))

In [None]:
# Correlation between follower count and public repository count, using
# Series.corr with its default method (Pearson).
follower_counts = udf['followers']
repo_counts = udf['public_repos']
correlation = follower_counts.corr(repo_counts)
print(correlation)

In [None]:
from sklearn.linear_model import LinearRegression
import numpy as np

# Feature matrix: public_repos reshaped into a single column, one row per user
repo_column = udf['public_repos'].values
X = repo_column.reshape(-1, 1)
# Target vector: followers
y = udf['followers'].values

# Fit a linear model followers ~ public_repos (fit() returns the estimator)
model = LinearRegression().fit(X, y)

# The single fitted coefficient is the slope: the change in predicted
# followers per additional public repository.
followers_per_repo = model.coef_[0]
print(followers_per_repo)

In [None]:
# Correlation between the has_projects and has_wiki columns of the repositories table
projects_flag = rdf['has_projects']
wiki_flag = rdf['has_wiki']
correlation_projects_wiki = projects_flag.corr(wiki_flag)
print(correlation_projects_wiki)

In [None]:
# Average `following` value for each distinct value of the hireable column
following_by_hireable = udf.groupby('hireable')['following']
hireable_following_mean = following_by_hireable.mean()
print(hireable_following_mean)

In [None]:
# NOTE(review): both literals look hand-copied from the per-hireable `following`
# means printed by an earlier cell — confirm, and ideally derive them from the
# data so the result cannot go stale.
following_mean_gap = 213.540541 - 149.409091
print(following_mean_gap)

In [None]:
# Keep only users that actually have a bio. read_csv loads blank fields as
# NaN, which a bare `!= ''` comparison lets through, so test for both.
has_bio = udf['bio'].notna() & (udf['bio'] != '')
# .copy() yields an independent frame, so adding a column below neither
# triggers SettingWithCopyWarning nor relies on view-vs-copy behaviour.
users_with_bios = udf[has_bio].copy()

# Length of each bio in characters
users_with_bios['bio_length'] = users_with_bios['bio'].str.len()

# Correlation (Series.corr default, Pearson) between bio length and followers
correlation_bio_followers = users_with_bios['bio_length'].corr(users_with_bios['followers'])
print(correlation_bio_followers)

In [None]:
# Parse creation timestamps in place, then store the day of week
# (Monday=0 ... Sunday=6) in a new column on rdf.
rdf['created_at'] = pd.to_datetime(rdf['created_at'])
rdf['weekday'] = rdf['created_at'].dt.dayofweek

# Keep Saturday (5) and Sunday (6) rows only
is_weekend = rdf['weekday'] >= 5
weekend_repos = rdf[is_weekend]

# Count weekend-created repositories per owner login; value_counts() sorts by
# descending frequency, so the first five labels are the top creators.
weekend_counts = weekend_repos['login'].value_counts()
top_5_weekend_creators = weekend_counts.index[:5]
# Emit the logins as one comma-separated line
print(','.join(top_5_weekend_creators))

In [None]:
def _share_with_email(emails):
    """Return the fraction of entries in *emails* that are not null."""
    return emails.notnull().mean()


# For each hireable value, the proportion of users whose email is present
email_sharing_by_hireable = udf.groupby('hireable')['email'].apply(_share_with_email)
print(email_sharing_by_hireable)


In [None]:
# Work only on users that have a name; .copy() makes the result an independent
# frame so the new column is not written onto a slice of udf.
udf_with_names = udf.loc[udf['name'].notnull()].copy()
# Surname = last whitespace-separated token of the name, upper-cased so that
# differently-cased spellings are counted together.
name_tokens = udf_with_names['name'].str.split()
udf_with_names['surname'] = name_tokens.str[-1].str.upper()

# Frequency of every surname
surname_counts = udf_with_names['surname'].value_counts()

# Highest frequency, and every surname that reaches it (ties included)
max_count = surname_counts.max()
is_top = surname_counts == max_count
most_common_surnames = surname_counts[is_top]

# Report the top count followed by the surname(s) that share it
print("Number of users with the most common surname(s):", max_count)
print("Most common surname(s):", ','.join(most_common_surnames.index))




In [None]:
# Correlate the has_projects and has_wiki columns of the repositories table
projects_enabled = rdf['has_projects']
wiki_enabled = rdf['has_wiki']
correlation_projects_wiki = projects_enabled.corr(wiki_enabled)

# Report the value with three decimal places
print("Correlation between projects and wiki enabled: {:.3f}".format(correlation_projects_wiki))


In [None]:
# Users that have a name, copied so the new column lands on an independent frame
users_with_names = udf.loc[udf['name'].notnull()].copy()

# Surname = last whitespace-separated token of the trimmed name (case is kept as-is)
trimmed_names = users_with_names['name'].str.strip()
users_with_names['surname'] = trimmed_names.str.split().str[-1]

# Frequency of every surname
surname_counts = users_with_names['surname'].value_counts()

# Highest frequency and the list of all surnames that reach it (ties included)
most_common_count = surname_counts.max()
tied_for_top = surname_counts == most_common_count
most_common_surnames = surname_counts.loc[tied_for_top].index.tolist()

# Alphabetically ordered, comma-and-space separated list of the top surname(s)
most_common_surnames_str = ', '.join(sorted(most_common_surnames))

# The user count for the top surname is the maximum frequency itself
number_of_users_with_common_surname = most_common_count

# Report the surname(s) and then the count
print("Most common surname(s): {}".format(most_common_surnames_str))
print("Number of users with the most common surname: {}".format(number_of_users_with_common_surname))
