In [None]:
# import packages
import pandas as pd
import numpy as np
import sys
import re
import functools

sys.path.append("../tool/")

import preprocess

In [None]:
# Read the raw 2017 and 2018 survey result CSV files into DataFrames.
df2017 = pd.read_csv(
    "../data/OriginalData/developer_survey_2017/survey_results_public_2017.csv"
)
df2018 = pd.read_csv(
    "../data/OriginalData/developer_survey_2018/survey_results_public.csv"
)

In [None]:
# Fields taken over from the 2017 survey under their original names.
columns = [
    "Professional", "University", "FormalEducation",
    "Gender", "Race", "Country",
    "Salary", "Currency", "CompanySize",
    "DeveloperType", "JobSatisfaction", "JobSeekingStatus",
    "Methodology", "WorkStart", "MetricAssess",
    "LastNewJob", "SelfTaughtTypes", "TimeAfterBootcamp",
    "EducationTypes",
]

# Fields that are added with one constant value for every row.
add_columns = dict(
    SalaryType="Yearly",
    YearsCoding=np.nan,
    OpenSource=np.nan,
)

# Mapping from a 2017 column name to the name it is renamed to.
rename_columns = {
    "EmploymentStatus": "Employment",
    "MajorUndergrad": "UndergradMajor",
    "WantWorkLanguage": "LanguageDesireNextYear",
    "HaveWorkedLanguage": "LanguageWorkedWith",
    "WantWorkDatabase": "DatabaseDesireNextYear",
    "HaveWorkedDatabase": "DatabaseWorkedWith",
    "HaveWorkedPlatform": "PlatformWorkedWith",
    "WantWorkPlatform": "PlatformDesireNextYear",
    "ProgramHobby": "Hobby",
    "ResumePrompted": "UpdateCV",
}

In [None]:
# Build the working frame from the chosen 2017 fields.
# The source names of the columns that get renamed are added to the selection
# first. The membership check keeps this cell idempotent: re-running it no
# longer appends the same names to `columns` a second time.
for column in rename_columns:
    if column not in columns:
        columns.append(column)

# NOTE(review): presumably extract_data returns a copy of df2017 restricted to
# `columns` (the original author's note suggests
# df2017.filter(items=columns, axis=1).copy()) - confirm in tool/preprocess.py.
df2017_certain = preprocess.extract_data(df2017, columns)

# Add the new fields, each filled with a single constant value.
for key, value in add_columns.items():
    df2017_certain[key] = value

# Rename the columns; reassigning the result avoids the in-place mutation.
df2017_certain = df2017_certain.rename(columns=rename_columns)

In [None]:
# Preview the first rows of the selected and renamed 2017 data.
df2017_certain.head()

In [None]:
# Convert every Gender value that is not "Male" or "Female" into "NoComment".
# NOTE(review): the exact matching rules live in preprocess.convert_single_func,
# which is not visible here - confirm in tool/preprocess.py.
df2017_certain.Gender = df2017_certain.Gender.apply(preprocess.convert_single_func, 
                                                    args=(["Male", "Female"], 
                                                         "NoComment"))
# Convert the "no information" answers of Race into "NoInfo".
# The regex pattern uses the typographic apostrophe (’) in "don’t".
pattern = re.compile(r"I don’t know|I prefer not to say")
df2017_certain.Race = df2017_certain.Race.apply(preprocess.convert_single_func, args=(pattern, "NoInfo", True))

# Extract the value in front of the first ";".
# The split must run after the conversion above, which works on the unsplit string.
df2017_certain.Race = df2017_certain.Race.str.split(";").apply(preprocess.convert_list_funct)


# Extract the currency value that is alphabetic text without a currency sign.
# NOTE(review): the meaning of the (pattern, False, True) arguments depends on
# preprocess.convert_single_func - TODO confirm.
pattern = re.compile(r"(\w*.*\s+\w*)+")
df2017_certain.Currency = df2017_certain.Currency.apply(preprocess.convert_single_func, args=(pattern, False, True))

In [None]:
# Convert the "no information" answers of CompanySize into "NoInfo".
# This pattern uses the plain ASCII apostrophe in "don't".
# NOTE(review): matching behavior depends on preprocess.convert_single_func - confirm in tool/preprocess.py.
pattern = re.compile(r"(I don't know)|(I prefer not to answer)")
df2017_certain.CompanySize = df2017_certain.CompanySize.apply(preprocess.convert_single_func, args=(pattern, "NoInfo", True))

In [None]:
# Convert the numeric JobSatisfaction score into a descriptive label.
def convert_job(x):
    """Translate a 0-10 job-satisfaction score into its text label.

    Integer scores map as follows: 0 -> "Extremely dissatisfied",
    1-2 -> "Moderately dissatisfied", 3-4 -> "Slightly dissatisfied",
    5 -> "Neither satisfied nor dissatisfied", 6-7 -> "Slightly satisfied",
    8-9 -> "Moderately satisfied", 10 -> "Extremely satisfied".

    Parameters
    ----------
    x : object
        A single cell value of the JobSatisfaction column.

    Returns
    -------
    object
        The label for a score between 0 and 10; any other value (NaN, None,
        text, numbers outside 0-10) is returned unchanged.
    """
    # Values that cannot be ordered against a number (None, or labels left by
    # an earlier run of this cell) used to raise TypeError; pass them through.
    try:
        in_range = 0 <= x <= 10
    except TypeError:
        return x

    # NaN fails both comparisons and lands here; negative scores previously
    # fell into the "x <= 2" branch and were mislabeled.
    if not in_range:
        return x

    if x == 0:
        return "Extremely dissatisfied"
    elif x <= 2:
        return "Moderately dissatisfied"
    elif x <= 4:
        return "Slightly dissatisfied"
    elif x == 5:
        return "Neither satisfied nor dissatisfied"
    elif x <= 7:
        return "Slightly satisfied"
    elif x <= 9:
        return "Moderately satisfied"
    elif x == 10:
        return "Extremely satisfied"
    else:
        return x

# Replace every JobSatisfaction value by the result of convert_job.
df2017_certain["JobSatisfaction"] = df2017_certain["JobSatisfaction"].apply(
    convert_job
)

In [None]:
# Merge YearsCodedJob and YearsCodedJobPast into the field YearsCoding:
# YearsCodedJob is used, and YearsCodedJobPast fills in only where
# YearsCodedJob is missing. fillna does this in one vectorized step instead of
# a Python-level iterrows loop.
# .tolist() keeps `years_coding` a plain list, so the assignment below stays
# positional (row order of df2017) rather than index-aligned.
years_coding = df2017["YearsCodedJob"].fillna(df2017["YearsCodedJobPast"]).tolist()

df2017_certain["YearsCoding"] = years_coding

In [None]:
# Todo: bring the YearsCoding values into uniform ranges.
# Maps each one-year range label to a three-year range label.
years_coding_dict = {
    "Less than a year": "0-2 years",
    "1 to 2 years": "0-2 years",
    "2 to 3 years" : "0-2 years",
    "3 to 4 years" : "3-5 years",
    "4 to 5 years" : "3-5 years", 
    "5 to 6 years" : "3-5 years", 
    "6 to 7 years" : "6-8 years", 
    "7 to 8 years" : "6-8 years", 
    "8 to 9 years" : "6-8 years", 
    "9 to 10 years" : "9-11 years",
    "10 to 11 years" : "9-11 years",
    "11 to 12 years" : "9-11 years",
    "12 to 13 years" : "12-14 years",
    "13 to 14 years" : "12-14 years",
    "14 to 15 years" : "12-14 years",
    "15 to 16 years" : "15-17 years",
    "16 to 17 years" : "15-17 years",
    "17 to 18 years" : "15-17 years",
    "18 to 19 years" : "18-20 years",
    "19 to 20 years" : "18-20 years",
}

# Assign the result back to the column. Calling replace(..., inplace=True) on
# the column accessor is a chained in-place update, which does not modify the
# frame once pandas Copy-on-Write is active.
df2017_certain["YearsCoding"] = df2017_certain["YearsCoding"].replace(years_coding_dict)

In [None]:
# Todo: bring the UpdateCV values into a uniform wording.
# The first key is a label cut off after "depart"; the last entry swaps the
# plain ASCII apostrophe for the typographic one (’).
update_cv = {
    'I received bad news about the future of my company or depart' : 'I received bad news about the future of my company or department',
    'I completed a major project, assignment, or contract' : 'My job status or other personal status changed',
    "I saw an employer's advertisement" : 'I saw an employer’s advertisement'
}

# Assign the result back to the column. Calling replace(..., inplace=True) on
# the column accessor is a chained in-place update, which does not modify the
# frame once pandas Copy-on-Write is active.
df2017_certain["UpdateCV"] = df2017_certain["UpdateCV"].replace(update_cv)

In [None]:
# Todo: bring the DeveloperType values into a uniform wording.
# Every key appears exactly once. The literal used to define
# 'Systems administrator', 'Graphics programming' and 'Developer with a
# statistics or mathematics background' twice; Python silently keeps the last
# value, so the earlier entries were dead. The values and the key order below
# are the ones that were effectively in use.
develop_type = {
    'Web developer': 'Web developer',
    'Mobile developer': 'Mobile developer',
    'Desktop applications developer': 'Desktop or enterprise applications developer',
    'Other': 'Other',
    'Embedded applications/devices developer': 'Embedded applications or devices developer',
    'Developer with a statistics or mathematics background': 'Educator or academic researcher',
    'Data scientist': 'Data scientist or machine learning specialist',
    'DevOps specialist': 'DevOps specialist',
    'Quality assurance engineer': 'QA or test developer',
    'Database administrator': 'Database administrator',
    'Graphics programming': 'Game or graphics developer',
    'Machine learning specialist': 'Data scientist or machine learning specialist',
    'Systems administrator': 'System administrator',
    'Graphic designer': 'Designer',
}

# Apply the DeveloperType mappings one at a time, in the dict's insertion order.
# NOTE(review): presumably replace_value substitutes `pat` with `repl` inside
# the "DeveloperType" column and modifies the frame in place - confirm in
# tool/preprocess.py.
for pat, repl in develop_type.items():
    preprocess.replace_value(df2017_certain, pat, repl, "DeveloperType", inplace=True)
# df2017_certain.DeveloperType.replace(develop_type, inplace=True)

In [None]:
# Todo: remove redundant spaces from the multi-valued fields.
# The three patterns cover a single leading or trailing space, a space after
# ";" and a space before ";".
# Fix: "DatabaseWorkedWith" was listed twice and "DatabaseDesireNextYear" was
# never cleaned; each field is now listed exactly once.
# NOTE(review): presumably replace_value treats `pattern` as a regular
# expression - confirm in tool/preprocess.py.
for pattern, replacement in zip([r"^ | $", r"; ", r" ;"], ["", ";", ";"]):
    for field in ["LanguageDesireNextYear", "LanguageWorkedWith",
                  "DatabaseDesireNextYear", "DatabaseWorkedWith",
                  "PlatformWorkedWith", "PlatformDesireNextYear",
                  "MetricAssess", "SelfTaughtTypes", "EducationTypes",
                  "DeveloperType"]:

        preprocess.replace_value(df2017_certain, pattern, replacement, field, inplace=True)


In [None]:
# Todo: split the ProgramHobby answers (held in the Hobby column) into the two
# fields Hobby and OpenSource.
# OpenSource has to be derived first, while Hobby still holds the original
# answers. Re-running this cell after Hobby has been collapsed to Yes/No would
# copy those Yes/No values into OpenSource.
df2017_certain["OpenSource"] = df2017_certain["Hobby"].replace({
    "Yes, I program as a hobby": "No",
    "Yes, both": "Yes",
    "Yes, I contribute to open source projects": "Yes",
})

# Assign the result back to the column. Calling replace(..., inplace=True) on
# the column accessor is a chained in-place update, which does not modify the
# frame once pandas Copy-on-Write is active.
df2017_certain["Hobby"] = df2017_certain["Hobby"].replace({
    "Yes, I program as a hobby": "Yes",
    "Yes, both": "Yes",
    "Yes, I contribute to open source projects": "No",
})

In [None]:
# Todo: replace the value "Not applicable/ never" with "I've never had a job".
# Assign the result back to the column. Calling replace(..., inplace=True) on
# the column accessor is a chained in-place update, which does not modify the
# frame once pandas Copy-on-Write is active.
df2017_certain["LastNewJob"] = df2017_certain["LastNewJob"].replace({
    "Not applicable/ never": "I've never had a job"
})

In [None]:
# Todo: replace the values of the field TimeAfterBootcamp.
# Assign the result back to the column. Calling replace(..., inplace=True) on
# the column accessor is a chained in-place update, which does not modify the
# frame once pandas Copy-on-Write is active.
df2017_certain["TimeAfterBootcamp"] = df2017_certain["TimeAfterBootcamp"].replace({
    "I already had a job as a developer when I started the program" : "I already had a full-time job as a developer when I began the program",
    "Immediately upon graduating":"Immediately after graduating",
    "I haven't gotten a job as a developer yet": "I haven’t gotten a developer job",
})

**There are three fields in the 2018 data whose values have to be converted:**

* DeveloperType
* SelfTaughtTypes
* YearsCoding

In [None]:
# DeveloperType label mapping; this assignment rebinds the name `develop_type`.
develop_type = {
    "Web developer": "Web developer",
    "Mobile developer": "Mobile developer",
    "Desktop applications developer": "Desktop or enterprise applications developer",
    "Other": "Other",
    "Embedded applications/devices developer": "Embedded applications or devices developer",
    "Developer with a statistics or mathematics background": "Developer with a statistics or mathematics background",
    "Data scientist": "Data scientist or machine learning specialist",
    "DevOps specialist": "DevOps specialist",
    "Quality assurance engineer": "QA or test developer",
    "Database administrator": "Database administrator",
    "Graphics programming": "Graphics programming",
    "Machine learning specialist": "Data scientist or machine learning specialist",
    "Systems administrator": "Systems administrator",
    "Graphic designer": "Designer",
}

In [None]:
# Maps the long SelfTaughtTypes answer texts to short labels.
# Fix: the two book entries were swapped. A book from a publisher such as
# O’Reilly or Apress is a trade book; a college/university book is a textbook.
self_taught = {
    'A book or e-book from O’Reilly, Apress, or a similar publisher' : 'Trade book',
    'A college/university computer science or software engineering book' : 'Textbook',
    'Internal Wikis, chat rooms, or documentation set up by my company for employees' :'Company internal community',
    'Online developer communities other than Stack Overflow (ex. forums, listservs, IRC channels, etc.)' : 'Non-Stack online communities',
    'Pre-scheduled tutoring or mentoring sessions with a friend or colleague': 'Tutoring/mentoring',
    'Questions & answers on Stack Overflow':'Stack Overflow Q&A',
    'Tapping your network of friends, family, and peers versed in the technology' : 'Friends network',
    'The official documentation and/or standards for the technology' : 'Official documentation',
    'The technology’s online help system' : 'Built-in help',
}

In [None]:
# Maps the long EducationTypes answer texts to short labels.
education_types = {
    "Completed an industry certification program (e.g. MCPD)": "Industry certification",
    "Contributed to open source software": "Open source contributions",
    "Participated in a full-time developer training program or bootcamp": "Bootcamp",
    "Participated in a hackathon": "Hackathon",
    "Participated in online coding competitions (e.g. HackerRank, CodeChef, TopCoder)": "Coding competition",
    "Received on-the-job training in software development": "On-the-job training",
    "Taken a part-time in-person course in programming or software development": "Part-time/evening course",
    "Taken an online course in programming or software development (e.g. a MOOC)": "Online course",
    "Taught yourself a new language, framework, or tool without taking a formal course": "Self-taught",
}

In [None]:
# Folds every range label above 20 years into the single label "20 or more years".
years_coding = {
    "21-23 years": "20 or more years",
    "24-26 years": "20 or more years",
    "27-29 years": "20 or more years",
    "30 or more years": "20 or more years",
}

In [None]:
# A bare `raise` with no active exception raises RuntimeError, so "Run All"
# aborts at this cell and the cells below run only when executed manually.
# NOTE(review): looks like a deliberate stop marker - confirm before removing.
raise

In [None]:
# LastNewJob labels (NaN included) that occur in the 2017 frame but not in the 2018 data.
(
    df2017_certain["LastNewJob"]
    .value_counts(dropna=False)
    .index
    .difference(df2018["LastNewJob"].value_counts(dropna=False).index)
)

In [None]:
# Frequency of every YearsCoding label in the 2017 frame, missing values included.
df2017_certain["YearsCoding"].value_counts(dropna=False)

In [None]:
# Frequency of every YearsCoding label in the 2018 data, missing values included.
df2018["YearsCoding"].value_counts(dropna=False)

In [None]:
# A bare `raise` with no active exception raises RuntimeError, so "Run All"
# aborts at this cell and the cells below run only when executed manually.
# NOTE(review): looks like a deliberate stop marker - confirm before removing.
raise

In [None]:
# Collect the distinct ";"-separated YearsCoding labels of the 2017 frame.
# Rows whose split result is not a list (missing values) are skipped.
testset = {
    label
    for labels in df2017_certain["YearsCoding"].str.split(";")
    if isinstance(labels, list)
    for label in labels
}

In [None]:
# Display the distinct 2017 YearsCoding labels collected in `testset`.
testset

In [None]:
# Collect the distinct ";"-separated YearsCoding labels of the 2018 data.
# Rows whose split result is not a list (missing values) are skipped.
testset2 = {
    label
    for labels in df2018["YearsCoding"].str.split(";")
    if isinstance(labels, list)
    for label in labels
}

In [None]:
# Display the distinct 2018 YearsCoding labels collected in `testset2`.
testset2

In [None]:
# Labels that are in testset2 (2018) but not in testset (2017).
testset2.difference(testset)

In [None]:
# Apply the `years_coding` mapping to the 2018 YearsCoding column.
# Assign the result back to the column. Calling replace(..., inplace=True) on
# the column accessor is a chained in-place update, which does not modify the
# frame once pandas Copy-on-Write is active.
df2018["YearsCoding"] = df2018["YearsCoding"].replace(years_coding)

In [None]:
# Labels that are in testset (2017) but not in testset2 (2018).
testset.difference(testset2)

In [None]:
# All labels that occur in either testset (2017) or testset2 (2018).
testset.union(testset2)

In [None]:
# Scratch comparison of two spellings of the same answer: the first literal
# uses the typographic apostrophe (’), the second the plain ASCII one (').
# Only the last expression is displayed as the cell output.
'I saw an employer’s advertisement'
"I saw an employer's advertisement"

In [None]:
# Count the DeveloperType values of the rows where ";" is not found after the first character.
df2017_certain.loc[
    lambda frame: ~(frame["DeveloperType"].str.find(";") > 0),
    "DeveloperType",
].value_counts()

In [None]:
# Count the DevType values of the rows where ";" is not found after the first character.
df2018.loc[
    lambda frame: ~(frame["DevType"].str.find(";") > 0),
    "DevType",
].value_counts()

In [None]:
# A bare `raise` with no active exception raises RuntimeError, so "Run All"
# aborts at this cell and the cells below run only when executed manually.
# NOTE(review): looks like a deliberate stop marker - confirm before removing.
raise

In [None]:
# Print the column names, non-null counts and dtypes of the 2017 frame.
df2017_certain.info()

In [None]:
# Write the processed 2017 frame to CSV; the row index is not written.
df2017_certain.to_csv(
    path_or_buf="../data/survey2017.csv",
    index=False,
)