In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

df = pd.read_csv("survey_results_public.csv")

In [2]:
relevant_columns = ['Age', 'Age1stCode', 'CompFreq', 'ConvertedComp', 
                    'Country', 'EdLevel', 'Employment', 'Gender',
                   'LanguageWorkedWith', 'LanguageDesireNextYear', 'MiscTechWorkedWith', 
                    'MiscTechDesireNextYear', 'UndergradMajor', 'YearsCode']

In [3]:
df = df[relevant_columns]

In [4]:
df = df.dropna()

In [5]:
df = df.rename({"ConvertedComp": "Salary"}, axis=1)

In [6]:
df = df[df["Salary"].notnull()]
df.head()

Unnamed: 0,Age,Age1stCode,CompFreq,Salary,Country,EdLevel,Employment,Gender,LanguageWorkedWith,LanguageDesireNextYear,MiscTechWorkedWith,MiscTechDesireNextYear,UndergradMajor,YearsCode
7,36.0,12,Yearly,116000.0,United States,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)",Employed full-time,Man,Python;SQL,JavaScript,Ansible,Unity 3D,"Computer science, computer engineering, or sof...",17
9,22.0,14,Yearly,32315.0,United Kingdom,"Master’s degree (M.A., M.S., M.Eng., MBA, etc.)",Employed full-time,Man,HTML/CSS;Java;JavaScript;Python;SQL,HTML/CSS;Java;JavaScript;Python;R;SQL,Pandas,Pandas;TensorFlow,Mathematics or statistics,8
10,23.0,13,Yearly,40070.0,United Kingdom,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)",Employed full-time,Man,C#;JavaScript;Swift,Go;JavaScript;Swift;TypeScript,Node.js,Node.js;React Native,"Computer science, computer engineering, or sof...",10
11,49.0,42,Monthly,14268.0,Spain,Some college/university study without earning ...,Employed full-time,Man,HTML/CSS;JavaScript,HTML/CSS;JavaScript,.NET,.NET,Mathematics or statistics,7
13,27.0,13,Yearly,66000.0,United States,"Associate degree (A.A., A.S., etc.)",Employed full-time,Man,HTML/CSS;JavaScript;SQL;TypeScript,HTML/CSS;JavaScript;SQL;TypeScript,Node.js,Node.js,"Computer science, computer engineering, or sof...",5


In [7]:
def shorten_categories(categories, cutoff):
    categorical_map = {}
    for i in range(len(categories)):
        if categories.values[i] >= cutoff:
            categorical_map[categories.index[i]] = categories.index[i]
        else:
            categorical_map[categories.index[i]] = 'Other'
    return categorical_map

In [8]:
country_map = shorten_categories(df.Country.value_counts(), 400)
df['Country'] = df['Country'].map(country_map)
df.Country.value_counts()

Other             6916
United States     4331
United Kingdom    1353
India             1229
Germany           1060
Canada             699
Brazil             638
France             582
Netherlands        424
Australia          419
Name: Country, dtype: int64

In [9]:
def clean_education(x):
    if 'Bachelor’s degree' in x:
        return 'Bachelor’s degree'
    if 'Master’s degree' in x:
        return 'Master’s degree'
    if 'Professional degree' in x or 'Other doctoral' in x:
        return 'Post grad'
    return 'Less than a Bachelors'

df['EdLevel'] = df['EdLevel'].apply(clean_education)

In [10]:
def clean_gender(x):
    if x == 'Man':
        return 'Man'
    if x == 'Woman':
        return 'Woman'
    return 'Other'

df['Gender'] = df['Gender'].apply(clean_gender)

In [11]:
def clean_undergradmajor(x):
    if 'Computer science' in x:
        return 'Computer science'
    if 'natural science' in x:
        return 'Natural science'
    if 'information technology' in x:
        return 'Information technology'
    if 'humanities' in x:
        return 'Humanities'
    if 'social science' in x :
        return "Social science"
    if 'Web development' in x:
        return 'Web development'
    if 'Another engineering discipline' in x:
        return 'Another engineering discipline'
    if 'never' in x:
        return "No major"
    if 'business' in x:
        return "business"
    if 'health science' in x:
        return "Health science"
    if 'Fine arts' in x:
        return "Fine arts"
    return x

df["UndergradMajor"] = df["UndergradMajor"].apply(clean_undergradmajor)

In [12]:
def clean_language_worked_with(i):
    if ";" in i:
        k = i.index(";")
        j = i[:k]
        return j
    else:
        return i
    
df["LanguageWorkedWith"] = df["LanguageWorkedWith"].apply(clean_language_worked_with)

In [13]:
def desired_language(i):
    if ";" in i:
        k = i.index(";")
        j = i[:k]
        return j
    else:
        return i
    
df["LanguageDesireNextYear"] = df["LanguageDesireNextYear"].apply(desired_language)

In [14]:
def misc_clean(i):
    if ";" in i:
        k = i.index(";")
        j = i[:k]
        return j
    else:
        return i
    
df["MiscTechWorkedWith"] = df["MiscTechWorkedWith"].apply(misc_clean)

In [15]:
def desired_misc_clean(i):
    if ";" in i:
        k = i.index(";")
        j = i[:k]
        return j
    else:
        return i
    
df["MiscTechDesireNextYear"] = df["MiscTechDesireNextYear"].apply(desired_misc_clean)

In [16]:
from sklearn.preprocessing import LabelEncoder

In [17]:
le_age = LabelEncoder()
df['Age'] = le_age.fit_transform(df['Age'])
df['Age'].unique()

array([24,  6,  8, 37, 13, 11, 10, 27, 22, 23, 20, 42, 32, 26, 36, 15, 18,
        5, 16, 30, 33, 17, 12, 25, 21, 28, 31, 29, 51, 41, 34, 38, 45, 35,
        3, 39,  4, 40, 59, 43, 44, 46, 52, 48, 49, 53, 54, 47, 64, 62, 61,
       58, 50,  1, 57, 55,  9, 60, 63, 65,  7,  2, 14,  0, 56, 19],
      dtype=int64)

In [18]:
le_age1 = LabelEncoder()
df['Age1stCode'] = le_age1.fit_transform(df['Age1stCode'])
df['Age1stCode'].unique()

array([ 2,  4,  3, 32,  5, 20,  0, 39,  1, 11,  6,  8,  7, 15, 40, 41,  9,
       13, 37, 38, 31, 25, 17, 10, 36, 16, 19, 26, 12, 18, 14, 22, 29, 21,
       23, 30, 35, 27, 28, 24, 33, 34])

In [19]:
le_freq = LabelEncoder()
df['CompFreq'] = le_freq.fit_transform(df['CompFreq'])
df['CompFreq'].unique()

array([2, 0, 1])

In [20]:
le_country = LabelEncoder()
df['Country'] = le_country.fit_transform(df['Country'])
df['Country'].unique()

array([9, 8, 7, 3, 1, 2, 4, 6, 5, 0])

In [21]:
le_ed = LabelEncoder()
df['EdLevel'] = le_ed.fit_transform(df['EdLevel'])
df['EdLevel'].unique()

array([0, 2, 1, 3])

In [22]:
le_employ = LabelEncoder()
df['Employment'] = le_employ.fit_transform(df['Employment'])
df['Employment'].unique()

array([0, 2, 1])

In [23]:
le_gender = LabelEncoder()
df['Gender'] = le_gender.fit_transform(df['Gender'])
df['Gender'].unique()

array([0, 2, 1])

In [24]:
le_lww = LabelEncoder()
df['LanguageWorkedWith'] = le_lww.fit_transform(df['LanguageWorkedWith'])
df['LanguageWorkedWith'].unique()

array([15,  7,  3,  1,  0,  6, 10,  2,  4,  9, 22,  5, 20, 16, 13, 14, 12,
        8, 11, 19, 17, 21, 18])

In [25]:
le_ld = LabelEncoder()
df['LanguageDesireNextYear'] = le_ld.fit_transform(df['LanguageDesireNextYear'])
df['LanguageDesireNextYear'].unique()

array([10,  7,  6,  3,  1, 16, 12,  0,  2,  5,  8, 11,  4, 23,  9, 19, 13,
       21, 18, 17, 20, 22, 24, 14, 15])

In [26]:
le_mt = LabelEncoder()
df['MiscTechWorkedWith'] = le_mt.fit_transform(df['MiscTechWorkedWith'])
df['MiscTechWorkedWith'].unique()

array([ 2, 10,  9,  0,  8,  1,  7,  3, 16,  5,  6, 14, 12,  4, 17, 13, 11,
       18, 15])

In [27]:
le_mtd = LabelEncoder()
df['MiscTechDesireNextYear'] = le_mtd.fit_transform(df['MiscTechDesireNextYear'])
df['MiscTechDesireNextYear'].unique()
le_ug = LabelEncoder()
df['UndergradMajor'] = le_ug.fit_transform(df['UndergradMajor'])
df['UndergradMajor'].unique()
le_years = LabelEncoder()
df['YearsCode'] = le_years.fit_transform(df['YearsCode'])
df['YearsCode'].unique()

array([ 8, 48,  1, 47, 44, 49, 12, 33,  7, 46, 37, 26, 16, 13,  4, 30, 10,
       22, 19, 23,  2,  6,  5, 14, 20,  9, 38, 17,  3, 34, 36, 11, 27, 25,
        0, 28, 15, 18, 29, 31, 21, 51, 24, 41, 32, 35, 42, 50, 39, 43, 45,
       40])

In [28]:
attributes_matrix = df.drop('Salary', axis=1).values

In [29]:
column_name = 'Salary'
column_number = df.columns.get_loc(column_name)
print("Column number:", column_number)

Column number: 3


In [30]:
decision_attribute = 3

In [31]:
print(attributes_matrix.shape)

(17651, 13)


In [32]:
attributes_matrix

array([[24,  2,  2, ..., 16,  1,  8],
       [ 6,  4,  2, ..., 10,  6, 48],
       [ 8,  3,  2, ...,  9,  1,  1],
       ...,
       [20,  2,  2, ..., 13,  1, 10],
       [21,  3,  2, ...,  9,  1, 12],
       [ 6,  7,  0, ...,  5,  1, 44]], dtype=int64)

In [33]:
def fuzzy_similarity(x, y):
    distance = np.linalg.norm(x - y)  # Calculate the Euclidean distance
    similarity = 1 / (1 + distance)  # Calculate the fuzzy similarity
    return similarity

def calculate_positive_region(attributes, decision_attribute):
    positive_region = set()

    for obj in attributes:
        if obj[decision_attribute] > 104141.83:
            positive_region.add(tuple(obj))
    
    return positive_region

def calculate_significance(attributes, positive_region, attribute):
    significance = 0

    for obj in positive_region:
        # Calculate the similarity between obj and attribute using fuzzy_similarity
        similarity = fuzzy_similarity(obj, attribute)
        significance += similarity
    
    return significance

def quick_reduct_fuzzy(attributes, decision_attribute):
    reduct = set()  # Initialize an empty set for the reduct
    positive_region = calculate_positive_region(attributes, decision_attribute)
    candidate = set(range(len(attributes[0])))  # Initialize the Candidate set with all attributes
    
    while candidate:
        max_significance = -1
        max_attribute = None
        
        for attr in candidate:
            temp_reduct = reduct | {attr}
            temp_significance = calculate_significance(attributes, positive_region, attributes[:, attr])
            
            if temp_significance > max_significance:
                max_significance = temp_significance
                max_attribute = attr
        
        if max_attribute is not None:
            reduct.add(max_attribute)
            candidate.remove(max_attribute)
    
    return reduct

reduct = quick_reduct_fuzzy(attributes_matrix, decision_attribute)
print("Reduct:", reduct)

Reduct: {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}


In [34]:
r = list(reduct)
reduct_column_names = df.columns[r]
print("Reduct Column Names:", reduct_column_names)

Reduct Column Names: Index(['Age', 'Age1stCode', 'CompFreq', 'Salary', 'Country', 'EdLevel',
       'Employment', 'Gender', 'LanguageWorkedWith', 'LanguageDesireNextYear',
       'MiscTechWorkedWith', 'MiscTechDesireNextYear', 'UndergradMajor'],
      dtype='object')


In [35]:
df = df.drop("YearsCode", axis = 1)

In [36]:
x = df.drop("Salary", axis = 1)
y = df["Salary"]

In [37]:
x

Unnamed: 0,Age,Age1stCode,CompFreq,Country,EdLevel,Employment,Gender,LanguageWorkedWith,LanguageDesireNextYear,MiscTechWorkedWith,MiscTechDesireNextYear,UndergradMajor
7,24,2,2,9,0,0,0,15,10,2,16,1
9,6,4,2,8,2,0,0,7,7,10,10,6
10,8,3,2,8,0,0,0,3,6,9,9,1
11,37,32,0,7,1,0,0,7,7,0,0,6
13,13,3,2,9,1,0,0,7,7,9,9,1
...,...,...,...,...,...,...,...,...,...,...,...,...
62212,17,11,0,7,0,0,0,3,0,0,0,0
63083,11,2,0,7,0,2,0,7,7,1,9,1
63402,20,2,2,9,1,0,0,1,1,1,13,1
63517,21,3,2,3,3,0,2,0,6,10,9,1


In [38]:
y

7        116000.0
9         32315.0
10        40070.0
11        14268.0
13        66000.0
           ...   
62212     38724.0
63083     45600.0
63402     74500.0
63517     59454.0
63955      9612.0
Name: Salary, Length: 17651, dtype: float64

In [39]:
from sklearn.tree import DecisionTreeRegressor
dec_tree_reg = DecisionTreeRegressor(random_state=0)
dec_tree_reg.fit(x, y.values)

In [40]:
y_pred = dec_tree_reg.predict(x)

In [41]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [42]:
error = np.sqrt(mean_squared_error(y, y_pred))
error

9621.734786299057

In [43]:
reduct

{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}

In [44]:
r = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]

In [45]:
df.columns[r]

Index(['Age', 'Age1stCode', 'CompFreq', 'Salary', 'Country', 'EdLevel',
       'Employment', 'Gender', 'LanguageWorkedWith', 'LanguageDesireNextYear',
       'MiscTechWorkedWith', 'MiscTechDesireNextYear', 'UndergradMajor'],
      dtype='object')

In [46]:
df['Employment'].unique()

array([0, 2, 1])

In [47]:
z = np.array([[25, 18, "Yearly", "India", "Bachelor’s degree", 
               "Employed full-time", "Man", "Python", "HTML/CSS", ".NET", 
               "React Native", "Computer science"]])
z

array([['25', '18', 'Yearly', 'India', 'Bachelor’s degree',
        'Employed full-time', 'Man', 'Python', 'HTML/CSS', '.NET',
        'React Native', 'Computer science']], dtype='<U18')

In [48]:
z[:, 0] = le_age.transform(z[:,0])
z[:, 1] = le_age1.transform(z[:,1])
z[:, 2] = le_freq.transform(z[:,2])
z[:, 3] = le_country.transform(z[:,3])
z[:, 4] = le_ed.transform(z[:,4])
z[:, 5] = le_employ.transform(z[:,5])
z[:, 6] = le_gender.transform(z[:,6])
z[:, 7] = le_lww.transform(z[:,7])
z[:, 8] = le_ld.transform(z[:,8])
z[:, 9] = le_mt.transform(z[:,9])
z[:, 10] = le_mtd.transform(z[:,10])
z[:, 11] = le_ug.transform(z[:,11])
z = z.astype(int)
z

array([[11,  8,  2,  5,  0,  0,  0, 15,  7,  0, 12,  1]])

In [49]:
y_pred = dec_tree_reg.predict(z)
y_pred



array([27923.])

In [50]:
import pickle

In [51]:
data = {"model": dec_tree_reg, "le_age" : le_age, "le_age1" : le_age1, "le_freq" : le_freq, "le_country" : 
        le_country, "le_ed" : le_ed, "le_employ" : le_employ, "le_gender" : le_gender, "le_lww" : le_lww, "le_ld" : 
        le_ld, "le_mt" : le_mt, "le_mtd" : le_mtd, "le_ug" : le_ug}
with open('saved_steps.pkl', 'wb') as file:
    pickle.dump(data, file)

In [52]:
with open('saved_steps.pkl', 'rb') as file:
    data = pickle.load(file)

regressor_loaded = data["model"]
le_age = data["le_age"]
le_age1 = data["le_age1"]
le_freq = data["le_freq"]
le_country = data["le_country"]
le_ed = data["le_ed"]
le_employ = data["le_employ"]
le_gender = data["le_gender"]
le_lww = data["le_lww"]
le_ld = data["le_ld"]
le_mt = data["le_mt"]
le_mtd = data["le_mtd"]
le_ug = data["le_ug"]

In [53]:
y_pred = regressor_loaded.predict(x)
y_pred

array([116000.,  32315.,  40070., ...,  74500.,  59454.,   9612.])

In [54]:
y_pred = regressor_loaded.predict(z)
y_pred



array([27923.])