In [3]:
import pandas as pd
import numpy as np
from datetime import datetime

# Read the CSV files
users_df = pd.read_csv('users.csv')
repos_df = pd.read_csv('repositories.csv')

# 1. Top 5 users by followers
def top_followers():
    return ','.join(users_df.nlargest(5, 'followers')['login'].tolist())

# 2. 5 earliest registered users
def earliest_users():
    users_df['created_at'] = pd.to_datetime(users_df['created_at'])
    return ','.join(users_df.nsmallest(5, 'created_at')['login'].tolist())

# 3. Most popular licenses
def popular_licenses():
    return ','.join(repos_df['license_name'].value_counts().head(3).index.tolist())

# 4. Most common company
def most_common_company():
    return users_df['company'].mode()[0]

# 5. Most popular language
def most_popular_language():
    return repos_df['language'].mode()[0]

# 6. Second most popular language for users after 2020
def second_popular_language_post_2020():
    users_df['created_at'] = pd.to_datetime(users_df['created_at'])
    recent_users = users_df[users_df['created_at'].dt.year > 2020]['login']
    recent_repos = repos_df[repos_df['login'].isin(recent_users)]
    return recent_repos['language'].value_counts().index[1]

# 7. Language with highest average stars
def highest_avg_stars_language():
    return repos_df.groupby('language')['stargazers_count'].mean().idxmax()

# 8. Top 5 by leader_strength
def top_leader_strength():
    users_df['leader_strength'] = users_df['followers'] / (1 + users_df['following'])
    return ','.join(users_df.nlargest(5, 'leader_strength')['login'].tolist())

# 9. Correlation between followers and repos
def followers_repos_correlation():
    return round(users_df['followers'].corr(users_df['public_repos']), 3)

# 10. Regression slope of followers on repos
from sklearn.linear_model import LinearRegression
def followers_repos_regression():
    X = users_df[['public_repos']]
    y = users_df['followers']
    reg = LinearRegression().fit(X, y)
    return round(reg.coef_[0], 3)

# 11. Correlation between projects and wiki
def projects_wiki_correlation():
    repos_df['has_projects'] = repos_df['has_projects'].map({'true': 1, 'false': 0})
    repos_df['has_wiki'] = repos_df['has_wiki'].map({'true': 1, 'false': 0})
    return round(repos_df['has_projects'].corr(repos_df['has_wiki']), 3)

# 12. Hireable users following difference
def hireable_following_diff():
    hireable = users_df[users_df['hireable'] == 'true']['following'].mean()
    not_hireable = users_df[users_df['hireable'] != 'true']['following'].mean()
    return round(hireable - not_hireable, 3)

# 13. Bio length correlation with followers
def bio_followers_correlation():
    users_df['bio_words'] = users_df['bio'].fillna('').str.split().str.len()
    users_with_bio = users_df[users_df['bio_words'] > 0]
    X = users_with_bio[['bio_words']]
    y = users_with_bio['followers']
    reg = LinearRegression().fit(X, y)
    return round(reg.coef_[0], 3)

# 14. Top weekend repository creators
def weekend_repo_creators():
    repos_df['created_at'] = pd.to_datetime(repos_df['created_at'])
    weekend_repos = repos_df[repos_df['created_at'].dt.dayofweek.isin([5, 6])]
    weekend_counts = weekend_repos['login'].value_counts()
    return ','.join(weekend_counts.head(5).index.tolist())

# 15. Hireable email sharing difference
def hireable_email_diff():
    hireable_email = users_df[users_df['hireable'] == 'true']['email'].notna().mean()
    other_email = users_df[users_df['hireable'] != 'true']['email'].notna().mean()
    return round(hireable_email - other_email, 3)

# 16. Most common surname
def most_common_surname():
    users_df['surname'] = users_df['name'].str.split().str[-1]
    return users_df['surname'].value_counts().iloc[0]

# Print all results
print("1.", top_followers())
print("2.", earliest_users())
print("3.", popular_licenses())
print("4.", most_common_company())
print("5.", most_popular_language())
print("6.", second_popular_language_post_2020())
print("7.", highest_avg_stars_language())
print("8.", top_leader_strength())
print("9.", followers_repos_correlation())
print("10.", followers_repos_regression())
print("11.", projects_wiki_correlation())
print("12.", hireable_following_diff())
print("13.", bio_followers_correlation())
print("14.", weekend_repo_creators())
print("15.", hireable_email_diff())
print("16.", most_common_surname())

1. brianyu28,PatrickAlphaC,KeithGalli,CharlesCreativeContent,timbl
2. evan,dpickett,tel,radical,joshuaclayton
3. mit,other,apache-2.0
4. NORTHEASTERN UNIVERSITY
5. JavaScript
6. C#
7. SQL
8. nikomatsakis,ccoenraets,KeithGalli,rstudio,pluskid
9. 0.168
10. 1.221
11. nan
12. 129.637
13. -5.674
14. berquist,lizbur10,burtbeckwith,jimkang,rwaldron
15. 0.111
16. 4


Few codes with correct answers

16


In [2]:
import csv
from collections import Counter

# Counter to store surname frequencies
surname_counter = Counter()

# Open the users.csv file and read data
with open('users.csv', 'r', encoding='utf-8') as file:
    reader = csv.DictReader(file)
    
    for row in reader:
        name = row.get('name', '').strip()
        if name:  # Ignore missing names
            # Split the name by whitespace and get the last word as the surname
            surname = name.split()[-1]
            surname_counter[surname] += 1

# Find the maximum frequency of surnames
if surname_counter:
    max_count = max(surname_counter.values())
    # Get all surnames with the maximum frequency
    most_common_surnames = [surname for surname, count in surname_counter.items() if count == max_count]
    # Sort surnames alphabetically
    most_common_surnames.sort()
    # Output the result
    print(f"{', '.join(most_common_surnames)}: {max_count}")
else:
    print("No names found.")

Williams: 4


13

In [1]:
import pandas as pd
from sklearn.linear_model import LinearRegression
import numpy as np

def analyze_bio_followers_correlation(users_csv_path='users.csv'):
    # Read the data
    df = pd.read_csv(users_csv_path)
    
    # Filter out rows without bios
    df = df[df['bio'].notna() & (df['bio'] != '')]
    
    # Calculate bio length in Unicode characters
    df['bio_length'] = df['bio'].str.len()
    
    # Prepare data for regression
    X = df['bio_length'].values.reshape(-1, 1)
    y = df['followers'].values
    
    # Perform linear regression
    model = LinearRegression()
    model.fit(X, y)
    
    # Get the slope rounded to 3 decimal places
    slope = round(model.coef_[0], 3)
    
    # Print debug information
    print(f"Number of users with bios: {len(df)}")
    print(f"Bio length range: {df['bio_length'].min()} to {df['bio_length'].max()}")
    print(f"Followers range: {df['followers'].min()} to {df['followers'].max()}")
    print(f"R-squared: {model.score(X, y):.3f}")
    
    return slope

# Calculate the regression slope
result = analyze_bio_followers_correlation()
print(f"\nRegression slope: {result:.3f}")

Number of users with bios: 264
Bio length range: 4 to 160
Followers range: 119 to 13203
R-squared: 0.001

Regression slope: -0.912


6

In [4]:
import csv
from collections import Counter
from datetime import datetime

# Define the list to store programming languages
languages = []

# Read the CSV file with UTF-8 encoding
with open('repositories.csv', 'r', encoding='utf-8') as file:
    reader = csv.DictReader(file)
    
    # Iterate through the rows in the CSV
    for row in reader:
        # Parse the created_at field
        created_at = row.get('created_at', '').strip()
        
        # Convert the date string to a datetime object
        if created_at:
            user_join_date = datetime.strptime(created_at, "%Y-%m-%dT%H:%M:%SZ")
            
            # Check if the user joined after 2020
            if user_join_date.year > 2020:
                # Get the language field and clean it up
                language = row.get('language', '').strip()
                if language:
                    languages.append(language)

# Count the occurrence of each language
language_counts = Counter(languages)

# Find the two most common languages
most_common_languages = language_counts.most_common(2)

# Print the second most common language
if len(most_common_languages) >= 2:
    print(most_common_languages[1][0])  # Second most common language
else:
    print("Not enough language data found.")

Python
