**Data Mining Group 4 Jupyter Notebook**

By: Assaf Bohen, Adrian Nica, TJ Jablonski

In [6]:
import requests
import zipfile
import io
import pandas as pd
from urllib.parse import urlparse
import os

def download_and_extract_csv(zip_url):
    """
    Downloads a ZIP file from `zip_url`, extracts the first .csv file found,
    and returns it as a Pandas DataFrame.
    """
    #determines if zip is found
    response = requests.get(zip_url)
    if response.status_code == 200:
        zip_content = io.BytesIO(response.content)
        
        #extract csv files
        with zipfile.ZipFile(zip_content, 'r') as zip_ref:
            for file_name in zip_ref.namelist():
                if file_name.endswith('.csv'):
                    with zip_ref.open(file_name) as csv_file:
                        df = pd.read_csv(csv_file)

                        #print success
                        parsed_url = urlparse(zip_url)
                        print(f'Successfully extracted {file_name} from {os.path.basename(parsed_url.path)}')
                        return df

        #no csv files found
        return None
    else:
        print(f"Failed to retrieve ZIP from {zip_url}")
        return None

In [7]:
#urls to zip files
csv_url_2024 = "https://cdn.sanity.io/files/jo7n4k8s/production/262f04c41d99fea692e0125c342e446782233fe4.zip/stack-overflow-developer-survey-2024.zip"
csv_url_2023 = "https://cdn.stackoverflow.co/files/jo7n4k8s/production/49915bfd46d0902c3564fd9a06b509d08a20488c.zip/stack-overflow-developer-survey-2023.zip"
csv_url_2022 = "https://info.stackoverflowsolutions.com/rs/719-EMH-566/images/stack-overflow-developer-survey-2022.zip"

#download csv files
odf_2024 = download_and_extract_csv(csv_url_2024)
odf_2023 = download_and_extract_csv(csv_url_2023)
odf_2022 = download_and_extract_csv(csv_url_2022)

Successfully extracted survey_results_public.csv from stack-overflow-developer-survey-2024.zip
Successfully extracted survey_results_public.csv from stack-overflow-developer-survey-2023.zip
Successfully extracted survey_results_public.csv from stack-overflow-developer-survey-2022.zip


In [10]:
#converts column names to uppercase and removes spacing
odf_2024.columns = odf_2024.columns.str.upper().str.replace(" ", "")
odf_2023.columns = odf_2023.columns.str.upper().str.replace(" ", "")
odf_2022.columns = odf_2022.columns.str.upper().str.replace(" ", "")

#prints all rows
with pd.option_context('display.max_columns', None,
                       'display.max_rows', None,
                       'display.width', 6000):
    print(odf_2024.head())

   RESPONSEID                      MAINBRANCH                 AGE           EMPLOYMENT REMOTEWORK   CHECK                                   CODINGACTIVITIES                                            EDLEVEL                                          LEARNCODE                                    LEARNCODEONLINE                                            TECHDOC YEARSCODE YEARSCODEPRO                DEVTYPE ORGSIZE PURCHASEINFLUENCE BUYNEWTOOL BUILDVSBUY TECHENDORSE                                            COUNTRY CURRENCY  COMPTOTAL                             LANGUAGEHAVEWORKEDWITH                             LANGUAGEWANTTOWORKWITH                                    LANGUAGEADMIRED           DATABASEHAVEWORKEDWITH      DATABASEWANTTOWORKWITH             DATABASEADMIRED                    PLATFORMHAVEWORKEDWITH                    PLATFORMWANTTOWORKWITH                           PLATFORMADMIRED                  WEBFRAMEHAVEWORKEDWITH            WEBFRAMEWANTTOWORKWITH               WEBFRA

In [21]:
#drop all cols from odf_2024 that didn't select Apples (only survey to check for bots)
print(f'odf_2024 rows before dropping bots: {odf_2024.shape[0]}')
odf_2024 = odf_2024[odf_2024['CHECK'] == 'Apples']
print(f'odf_2024 rows after dropping bots: {odf_2024.shape[0]}\n')

#finds common column names across all dfs
common_cols = set(odf_2024.columns).intersection(odf_2023.columns).intersection(odf_2022.columns)

#creates copies of dfs
df_2024 = odf_2024[list(common_cols)].copy()
df_2023 = odf_2023[list(common_cols)].copy()
df_2022 = odf_2022[list(common_cols)].copy()

#adds a year column to all dfs
df_2024["year"] = 2024
df_2023["year"] = 2023
df_2022["year"] = 2022

#combined the common columns of the dfs
combined_df = pd.concat([df_2024, df_2023, df_2022], ignore_index=True)

#prints shape and all rows
print(f"Combined DataFrame shape: {combined_df.shape}")
with pd.option_context('display.max_columns', None,
                       'display.max_rows', None,
                       'display.width', 6000):
    print(combined_df.head())

odf_2024 rows before dropping bots: 65437
odf_2024 rows after dropping bots: 65437

Combined DataFrame shape: (227889, 63)
                                             EDLEVEL                  ICORPM                                   CODINGACTIVITIES  RESPONSEID BUYNEWTOOL FREQUENCY_1 PURCHASEINFLUENCE                 OFFICESTACKSYNCHAVEWORKEDWITH           SOCOMM                           MISCTECHHAVEWORKEDWITH  WORKEXP KNOWLEDGE_2 KNOWLEDGE_1 KNOWLEDGE_3                      TOOLSTECHHAVEWORKEDWITH                 KNOWLEDGE_6 KNOWLEDGE_7         OPSYSPERSONALUSE                DEVTYPE KNOWLEDGE_4           EMPLOYMENT                                    LEARNCODEONLINE SURVEYEASE OFFICESTACKSYNCWANTTOWORKWITH                                         NEWSOSITES                             LANGUAGEHAVEWORKEDWITH                 AGE                      MAINBRANCH FREQUENCY_2            WEBFRAMEWANTTOWORKWITH  COMPTOTAL                  WEBFRAMEHAVEWORKEDWITH SOACCOUNT ORGSIZE FREQUENCY_3 

**Job Satisfaction Data Exploration**

In [19]:
#show all columns
print(list(odf_2024.columns))

['RESPONSEID', 'MAINBRANCH', 'AGE', 'EMPLOYMENT', 'REMOTEWORK', 'CHECK', 'CODINGACTIVITIES', 'EDLEVEL', 'LEARNCODE', 'LEARNCODEONLINE', 'TECHDOC', 'YEARSCODE', 'YEARSCODEPRO', 'DEVTYPE', 'ORGSIZE', 'PURCHASEINFLUENCE', 'BUYNEWTOOL', 'BUILDVSBUY', 'TECHENDORSE', 'COUNTRY', 'CURRENCY', 'COMPTOTAL', 'LANGUAGEHAVEWORKEDWITH', 'LANGUAGEWANTTOWORKWITH', 'LANGUAGEADMIRED', 'DATABASEHAVEWORKEDWITH', 'DATABASEWANTTOWORKWITH', 'DATABASEADMIRED', 'PLATFORMHAVEWORKEDWITH', 'PLATFORMWANTTOWORKWITH', 'PLATFORMADMIRED', 'WEBFRAMEHAVEWORKEDWITH', 'WEBFRAMEWANTTOWORKWITH', 'WEBFRAMEADMIRED', 'EMBEDDEDHAVEWORKEDWITH', 'EMBEDDEDWANTTOWORKWITH', 'EMBEDDEDADMIRED', 'MISCTECHHAVEWORKEDWITH', 'MISCTECHWANTTOWORKWITH', 'MISCTECHADMIRED', 'TOOLSTECHHAVEWORKEDWITH', 'TOOLSTECHWANTTOWORKWITH', 'TOOLSTECHADMIRED', 'NEWCOLLABTOOLSHAVEWORKEDWITH', 'NEWCOLLABTOOLSWANTTOWORKWITH', 'NEWCOLLABTOOLSADMIRED', 'OPSYSPERSONALUSE', 'OPSYSPROFESSIONALUSE', 'OFFICESTACKASYNCHAVEWORKEDWITH', 'OFFICESTACKASYNCWANTTOWORKWITH', '