In [None]:
import csv
import os
import time

import requests

# GitHub personal access token. It is read from the GITHUB_TOKEN environment
# variable so that a real credential never has to be saved inside the notebook;
# the original 'token' placeholder is kept as the fallback when it is unset.
GITHUB_TOKEN = os.environ.get('GITHUB_TOKEN', 'token')

# Headers sent with every GitHub REST API request made by this notebook.
headers = {
    'Authorization': f'Bearer {GITHUB_TOKEN}',
    'Accept': 'application/vnd.github+json',
    'X-GitHub-Api-Version': '2022-11-28'
}

def get_users_in_chennai(min_followers=50):
    """Search GitHub for users located in Chennai with more than ``min_followers`` followers.

    Pages through ``GET /search/users`` (30 results per page) and collects the
    ``items`` list of every page.

    Args:
        min_followers: Only users with strictly more followers than this value
            are requested (the query uses ``followers:>N``).

    Returns:
        list: The user summary dicts returned by the search endpoint. When a
        request does not return HTTP 200 the status code is printed and the
        users gathered so far are returned.
    """
    # The search API serves at most the first 1000 matches of a query even if
    # total_count is larger, so paging has to stop there.
    max_search_results = 1000
    search_url = "https://api.github.com/search/users"

    users = []
    page = 1
    per_page = 30
    while True:
        # Let requests build and escape the query string instead of
        # interpolating it into the URL by hand.
        params = {
            'q': f"location:Chennai followers:>{min_followers}",
            'page': page,
            'per_page': per_page,
        }
        response = requests.get(search_url, headers=headers, params=params, timeout=30)
        if response.status_code != 200:
            print(f"Error: {response.status_code}")
            break
        data = response.json()

        items = data.get('items', [])
        if not items:
            # An empty page means nothing more can be fetched. Without this
            # check the loop would never end when fewer users are returned
            # than total_count announces.
            break
        users.extend(items)

        # A missing total_count is treated as "unknown": keep paging until the
        # cap or an empty page is reached.
        reachable = min(data.get('total_count', max_search_results), max_search_results)
        if len(users) >= reachable:
            break

        page += 1

    return users

# get_users_in_chennai()

In [None]:
def clean_company_name(company_name):
    """Normalise a GitHub ``company`` value.

    Surrounding whitespace is trimmed, one leading ``@`` is removed and the
    remainder is upper-cased. Falsy input (``None`` or ``''``) yields ``None``.
    """
    if company_name:
        trimmed = company_name.strip()
        # Only the first '@' is dropped; any further ones are kept as-is.
        without_at = trimmed[1:] if trimmed.startswith('@') else trimmed
        return without_at.upper()
    return None

def get_user_details(username):
    """Fetch the full GitHub profile of ``username``.

    Returns the decoded profile dict with its ``company`` field passed through
    ``clean_company_name``. When the API does not answer with HTTP 200 the
    status code is printed and ``None`` is returned.
    """
    response = requests.get(f"https://api.github.com/users/{username}", headers=headers)
    if response.status_code == 200:
        profile = response.json()
        profile['company'] = clean_company_name(profile.get('company'))
        return profile

    print(f"Error fetching details for {username}: {response.status_code}")
    return None

# get_user_details('Premalatha-success')

In [None]:
def main():
    """Write one row per matching Chennai user to ``users.csv``.

    The user list comes from ``get_users_in_chennai()``; each user's full
    profile is then fetched with ``get_user_details`` and users whose profile
    could not be fetched are skipped.
    """
    fieldnames = ['login', 'name', 'company', 'location', 'email', 'hireable', 'bio',
                  'public_repos', 'followers', 'following', 'created_at']
    # Columns copied straight from the profile, grouped by the default used
    # when the key is absent.
    text_columns = ['login', 'name', 'company', 'location', 'email', 'bio', 'created_at']
    count_columns = ['public_repos', 'followers', 'following']

    users = get_users_in_chennai()
    with open('users.csv', 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        for user in users:
            username = user['login']
            print(f"Fetching details for user: {username}")
            user_details = get_user_details(username)
            if not user_details:
                continue

            row = {column: user_details.get(column, '') for column in text_columns}
            row.update({column: user_details.get(column, 0) for column in count_columns})

            # hireable is written as lower-case text, or left blank when it is
            # missing/None.
            hireable = user_details.get('hireable')
            row['hireable'] = '' if hireable is None else str(hireable).lower()

            writer.writerow(row)

# Run the users export: writes users.csv, fetching each user's profile in turn.
main()

In [None]:
# Google Colab only: hand users.csv to the browser as a file download.
from google.colab import files
files.download('users.csv')


In [None]:
import pandas as pd

# Load the exported user profiles (empty cells become NaN with the default settings).
df = pd.read_csv('users.csv')

# Show the first rows as a quick check of the export.
df.head()


In [None]:
# Convert created_at to datetimes so that sorting is chronological.
df['created_at'] = pd.to_datetime(df['created_at'])

# The five oldest accounts, earliest first.
earliest_users = df.sort_values('created_at').iloc[:5]

# Comma-separated logins in that order.
earliest_logins = ",".join(earliest_users['login'].tolist())
print(earliest_logins)


In [None]:
import pandas as pd
import requests
import time

# Re-read the user list; keep_default_na=False keeps empty cells as '' instead of NaN.
users_df = pd.read_csv('users.csv', keep_default_na=False)

def get_user_repositories(username):
    """Collect up to 500 repositories of ``username``, most recently updated first.

    Requests ``/users/{username}/repos`` 100 entries at a time until an empty
    page is returned, 500 repositories have been gathered, or a request does
    not return HTTP 200 (the status is printed and what was gathered so far is
    returned).

    Returns:
        list[dict]: One flat record per repository with the keys ``login``,
        ``full_name``, ``created_at``, ``stargazers_count``, ``watchers_count``,
        ``language``, ``has_projects``, ``has_wiki`` and ``license_name``.
    """
    collected = []
    page = 1
    per_page = 100
    while True:
        url = f"https://api.github.com/users/{username}/repos?sort=updated&direction=desc&page={page}&per_page={per_page}"
        response = requests.get(url, headers=headers)
        if response.status_code != 200:
            print(f"Error fetching repos for {username}: {response.status_code}")
            break

        data = response.json()
        if not data:
            break

        # Flatten each API repository object into the CSV row shape; a missing
        # language or license is stored as an empty string.
        collected.extend(
            {
                'login': username,
                'full_name': repo['full_name'],
                'created_at': repo['created_at'],
                'stargazers_count': repo['stargazers_count'],
                'watchers_count': repo['watchers_count'],
                'language': repo['language'] or '',
                'has_projects': repo['has_projects'],
                'has_wiki': repo['has_wiki'],
                'license_name': repo['license']['key'] if repo['license'] else ''
            }
            for repo in data
        )

        if len(collected) >= 500:
            collected = collected[:500]
            break

        page += 1

    return collected

def main():
    """Fetch the repositories of every user in ``users_df`` and save them to ``repositories.csv``."""
    all_repos = []

    for username in users_df['login']:
        print(f"Fetching repositories for user: {username}")
        all_repos += get_user_repositories(username)

    # Build the frame once from the accumulated records, then write it out.
    pd.DataFrame(all_repos).to_csv('repositories.csv', index=False)

# Run the repository export. This `main` is the one defined in this cell; it
# shadows the earlier users-export function of the same name.
main()


In [None]:
# Google Colab only: hand repositories.csv to the browser as a file download.
from google.colab import files
files.download('repositories.csv')

In [None]:
# Load the exported repositories; note that this rebinds `df`, which held the users table before.
df = pd.read_csv('repositories.csv')

# Show the first rows as a quick check of the export.
df.head()


In [None]:
df = pd.read_csv('repositories.csv')

# Store both boolean flags as lower-case text ('true' / 'false') in the CSV.
for flag_column in ('has_projects', 'has_wiki'):
    df[flag_column] = df[flag_column].astype(str).str.lower()

# Overwrite the file in place with the normalised flags.
df.to_csv('repositories.csv', index=False)

In [None]:
repos_df = pd.read_csv('repositories.csv', keep_default_na=False)

# Blank license cells are turned into NA so that value_counts() leaves them out.
licenses = repos_df['license_name'].dropna().replace('', pd.NA)

# Counts come back sorted from most to least frequent.
license_counts = licenses.value_counts()

top_3_licenses = license_counts.iloc[:3]

top_3_license_names = ",".join(top_3_licenses.index.tolist())
print(top_3_license_names)


In [None]:
# Default NA handling: empty company cells are read as NaN, which
# value_counts() excludes.
users_df = pd.read_csv('users.csv')

# Frequency of each company value, most common first.
company_counts = users_df['company'].value_counts()

# First index entry = the most common company.
most_frequent_company = company_counts.index[0]

most_frequent_company

In [None]:
# Repositories without a language are stored as '' (or NaN, depending on how
# the CSV was read). Exclude them explicitly rather than assuming the blank
# bucket is ranked first and skipping it by position.
language_counts = repos_df['language'].replace('', pd.NA).dropna().value_counts()

# value_counts() sorts by frequency, so the first entry is the most used language.
most_frequent_language = language_counts.index[0]

most_frequent_language

In [None]:
# Convert the join date to datetimes so it can be compared with a date.
users_df['created_at'] = pd.to_datetime(users_df['created_at'])

# Users whose account was created after 2020-01-01.
recent_users = users_df[users_df['created_at'] > '2020-01-01']

# Attach each recent user's repositories (users without repositories drop out).
merged_df = pd.merge(recent_users, repos_df, on='login', how='inner')

# Count real languages only. Blank entries (repositories with no language) are
# removed here instead of being compensated for by picking the third-ranked
# bucket, which is only correct when the blank bucket is in the top two.
language_counts = merged_df['language'].replace('', pd.NA).dropna().value_counts()

# value_counts() is sorted by frequency, so position 1 is the runner-up.
second_most_popular_language = language_counts.index[1]

print(second_most_popular_language)

In [None]:
# Mean stargazers_count of the repositories in each language group.
# NOTE(review): if repos_df was read with keep_default_na=False, repositories
# without a language form their own '' group and could win here - confirm
# whether they should be excluded.
average_stars_per_language = repos_df.groupby('language')['stargazers_count'].mean()

# Language whose repositories have the highest mean star count.
highest_average_language = average_stars_per_language.idxmax()
highest_average_language

In [None]:
# leader_strength = followers / (1 + following); the +1 keeps the denominator non-zero.
users_df['leader_strength'] = users_df['followers'].div(users_df['following'].add(1))

# Five users with the highest leader_strength, strongest first.
top_5_leader_strength = users_df.sort_values('leader_strength', ascending=False).iloc[:5]

top_5_logins = ",".join(top_5_leader_strength['login'].tolist())
print(top_5_logins)

In [None]:
# Keep only the two columns being compared.
followers_repos_df = users_df[['followers', 'public_repos']]

# Series.corr computes the Pearson correlation by default.
correlation = followers_repos_df['followers'].corr(followers_repos_df['public_repos'])

# Reported to three decimal places.
formatted_correlation = round(correlation, 3)

print(formatted_correlation)


In [None]:
import statsmodels.api as sm

# Ordinary least squares regression of followers on public_repos.
X = users_df['public_repos']  # Independent variable
y = users_df['followers']      # Dependent variable

# Add an intercept column so the model fits followers = const + slope * public_repos.
X = sm.add_constant(X)

model = sm.OLS(y, X).fit()

# Coefficient of public_repos: estimated change in followers per additional public repository.
slope = model.params['public_repos']

# Reported to three decimal places.
formatted_slope = round(slope, 3)

print(formatted_slope)

In [None]:
# Reload the repositories; empty cells stay '' because of keep_default_na=False.
repos_df = pd.read_csv('repositories.csv', keep_default_na=False)

# Inspect what the two flag columns actually contain before casting them.
print("Unique values in has_projects:", repos_df['has_projects'].unique())
print("Unique values in has_wiki:", repos_df['has_wiki'].unique())

# Cast the flags to 0/1 for the correlation.
# NOTE(review): this assumes read_csv already parsed the columns as booleans;
# astype(int) raises on plain 'true'/'false' strings - confirm against the output above.
repos_df['has_projects'] = repos_df['has_projects'].astype(int)
repos_df['has_wiki'] = repos_df['has_wiki'].astype(int)

print("Missing values in has_projects:", repos_df['has_projects'].isna().sum())
print("Missing values in has_wiki:", repos_df['has_wiki'].isna().sum())

# Pearson correlation (the Series.corr default) between the two 0/1 flags.
correlation = repos_df['has_projects'].corr(repos_df['has_wiki'])

# Reported to three decimal places.
formatted_correlation = round(correlation, 3)

print(formatted_correlation)


In [None]:
# Empty cells stay '' because of keep_default_na=False.
users_df = pd.read_csv('users.csv', keep_default_na=False)

# users.csv stores hireable as lower-case 'true' / 'false' or an empty cell.
# Compare case-insensitively against 'true' so a 'false' string is never turned
# into True by a truthiness cast; blanks count as not hireable. The astype(str)
# also covers the case where read_csv already parsed the column as booleans.
users_df['hireable'] = users_df['hireable'].astype(str).str.strip().str.lower().eq('true')

print("Missing values in hireable after replacement:", users_df['hireable'].isna().sum())

# Mean number of accounts followed, for hireable and for non-hireable users.
average_following_hireable = users_df[users_df['hireable']]['following'].astype(int).mean()
average_following_non_hireable = users_df[~users_df['hireable']]['following'].astype(int).mean()

# Positive result: hireable users follow more accounts on average.
average_difference = average_following_hireable - average_following_non_hireable

formatted_difference = round(average_difference, 3)

print(formatted_difference)


In [None]:
# Keep only users whose bio is not blank; work on a copy so a column can be added safely.
has_bio = users_df['bio'].str.strip().ne('')
users_with_bio = users_df[has_bio].copy()

# bio_length is the number of whitespace-separated words in the bio.
users_with_bio['bio_length'] = users_with_bio['bio'].map(lambda bio: len(bio.split()))

X = users_with_bio['bio_length']  # Independent variable
y = users_with_bio['followers']    # Dependent variable

# Add the intercept term, then fit followers on bio_length by ordinary least squares.
X = sm.add_constant(X)
model = sm.OLS(y, X).fit()

# Coefficient of bio_length, reported to three decimal places.
slope = model.params['bio_length']
formatted_slope = round(slope, 3)

print(formatted_slope)

In [None]:
# Convert repository creation times to datetimes.
repos_df['created_at'] = pd.to_datetime(repos_df['created_at'])

# dayofweek is 0 for Monday ... 6 for Sunday, so 5 and above marks Saturday/Sunday.
repos_df['is_weekend'] = repos_df['created_at'].dt.dayofweek.ge(5)

weekend_repos = repos_df.loc[repos_df['is_weekend']]

# Five users with the most weekend-created repositories, most first.
user_weekend_counts = weekend_repos['login'].value_counts().iloc[:5]

top_5_users = list(user_weekend_counts.index)

top_5_users_string = ','.join(top_5_users)

print(top_5_users_string)

In [None]:
users_df = pd.read_csv('users.csv', keep_default_na=False)

# Ignore users without a name.
users_df = users_df[users_df['name'] != '']

# Surname = last whitespace-separated word of the trimmed name.
users_df['surname'] = users_df['name'].str.strip().str.split().str.get(-1)

surname_counts = users_df['surname'].value_counts()

max_count = surname_counts.max()

# Every surname tied for the highest count, in alphabetical order.
most_common_surnames = sorted(surname_counts[surname_counts.eq(max_count)].index.tolist())

result = ','.join(most_common_surnames)

print(result)


In [None]:
# `uploaded` is not defined in the visible part of this notebook (presumably it
# is the result of a Colab files.upload() call made in another cell). Use its
# first file when it exists, otherwise fall back to the users.csv written by
# this notebook so the cell does not fail with a NameError on a fresh kernel.
users_csv_path = (
    list(uploaded.keys())[0]
    if 'uploaded' in globals() and uploaded
    else 'users.csv'
)
users_df = pd.read_csv(users_csv_path, keep_default_na=False)

In [None]:
# users.csv stores hireable as lower-case 'true' / 'false' or an empty cell.
# Compare case-insensitively against 'true' so a 'false' string is never turned
# into True by a truthiness cast; blanks count as not hireable. The expression
# is idempotent, so re-running the cell on an already boolean column is safe.
users_df['hireable'] = users_df['hireable'].astype(str).str.strip().str.lower().eq('true')

print("Unique values in 'hireable':", users_df['hireable'].unique())

# Treat a missing email the same as an empty one.
users_df['email'] = users_df['email'].fillna('')

# Fraction of users with a non-empty email in each group.
hireable_with_email = users_df[users_df['hireable']]['email'].ne('').mean()
non_hireable_with_email = users_df[~users_df['hireable']]['email'].ne('').mean()

# Positive result: hireable users share an email more often.
difference = round(hireable_with_email - non_hireable_with_email, 3)

print(difference)