In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score


In [2]:
# Load the dataset from the CSV file into a DataFrame.
data = pd.read_csv(filepath_or_buffer='1.csv')

In [3]:
# Display the freshly loaded DataFrame for a first look at its columns and values.
data

Unnamed: 0,Years_in_CS,Programming_Languages,Frameworks,Databases,DevOps_Tools,Networking_Experience,Cybersecurity_Experience,AI_Data_Science_Experience,Current_Role,Happy_in_Current_Role,Current_Role_Difficulty
0,,Rust,.NET Core,CassandraDB,Git,Yes,Yes,No,Systems Analyst,No,No
1,1-3 years,C,CryEngine,Other (please specify),Other (please specify),Yes,Yes,Yes,Mobile Developer,Yes,Yes
2,Less than 1 year,Ruby,OpenVAS,OracleDB,CircleCI,No,Yes,Yes,Cybersecurity Analyst,No,No
3,,C#,Kali Linux,PostgreSQL,Terraform,Yes,Yes,Yes,Data Scientist,No,Yes
4,More than 5 years,PHP,Next.js,CassandraDB,Azure,No,Yes,No,DevOps Engineer,No,No
...,...,...,...,...,...,...,...,...,...,...,...
1995,3-5 years,Java,Snort,OracleDB,Jenkins,No,No,Yes,Product Manager,No,Yes
1996,More than 5 years,,Apache Spark,Other (please specify),DigitalOcean,No,No,Yes,Cybersecurity Analyst,No,Yes
1997,More than 5 years,PHP,TensorFlow,OracleDB,Docker,Yes,Yes,Yes,Data Engineer,No,Yes
1998,Less than 1 year,Swift,.NET Core,Other (please specify),Git,Yes,Yes,Yes,Mobile Developer,No,Yes


In [4]:
# Ordinal codes for the experience labels: the labels are listed from least
# to most experience and numbered 0-4 in that order.
experience_mapping = dict(zip(
    [
        'None',
        'Less than 1 year',
        '1-3 years',
        '3-5 years',
        'More than 5 years',
    ],
    range(5),
))

In [5]:
# Translate each experience label to its ordinal code. Values that are missing
# or not present in the mapping come back as NaN and are scored 0.
seniority_codes = data['Years_in_CS'].map(experience_mapping)
data['Seniority_Score'] = seniority_codes.fillna(0)


In [6]:
# One-hot encode 'Programming_Languages' into one 0/1 column per language.
mlb = MultiLabelBinarizer()

# Normalise every cell to a list of labels: existing lists are kept (so running
# this line again is harmless), missing values become [], and strings are
# split on commas.
data['Programming_Languages'] = data['Programming_Languages'].apply(
    lambda x: x if isinstance(x, list) else ([] if pd.isna(x) else x.split(","))
)
languages_encoded = mlb.fit_transform(data['Programming_Languages'])

# Build the indicator frame on data's own index so the index-based join below
# pairs every row with its own indicators even when data does not carry a
# default RangeIndex.
languages_data = pd.DataFrame(languages_encoded, columns=mlb.classes_, index=data.index)

# NOTE: the new columns are named by the raw label with no suffix; join()
# raises if any of them already exists in data (e.g. when this cell runs twice).
data = data.join(languages_data)

In [7]:
# List the column labels to confirm which columns the frame now holds.
data.columns

Index(['Years_in_CS', 'Programming_Languages', 'Frameworks', 'Databases',
       'DevOps_Tools', 'Networking_Experience', 'Cybersecurity_Experience',
       'AI_Data_Science_Experience', 'Current_Role', 'Happy_in_Current_Role',
       'Current_Role_Difficulty', 'Seniority_Score', 'C', 'C#', 'C++', 'Dart',
       'Go', 'Java', 'JavaScript', 'Kotlin', 'Other (please specify)', 'PHP',
       'Python', 'Ruby', 'Rust', 'Swift', 'TypeScript'],
      dtype='object')

In [8]:
# Preview the frame without the two raw text columns. The result is only
# displayed; `data` itself is left unchanged because nothing is assigned.
data.drop(columns=['Programming_Languages', 'Years_in_CS'])

Unnamed: 0,Frameworks,Databases,DevOps_Tools,Networking_Experience,Cybersecurity_Experience,AI_Data_Science_Experience,Current_Role,Happy_in_Current_Role,Current_Role_Difficulty,Seniority_Score,...,Java,JavaScript,Kotlin,Other (please specify),PHP,Python,Ruby,Rust,Swift,TypeScript
0,.NET Core,CassandraDB,Git,Yes,Yes,No,Systems Analyst,No,No,0.0,...,0,0,0,0,0,0,0,1,0,0
1,CryEngine,Other (please specify),Other (please specify),Yes,Yes,Yes,Mobile Developer,Yes,Yes,2.0,...,0,0,0,0,0,0,0,0,0,0
2,OpenVAS,OracleDB,CircleCI,No,Yes,Yes,Cybersecurity Analyst,No,No,1.0,...,0,0,0,0,0,0,1,0,0,0
3,Kali Linux,PostgreSQL,Terraform,Yes,Yes,Yes,Data Scientist,No,Yes,0.0,...,0,0,0,0,0,0,0,0,0,0
4,Next.js,CassandraDB,Azure,No,Yes,No,DevOps Engineer,No,No,4.0,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,Snort,OracleDB,Jenkins,No,No,Yes,Product Manager,No,Yes,3.0,...,1,0,0,0,0,0,0,0,0,0
1996,Apache Spark,Other (please specify),DigitalOcean,No,No,Yes,Cybersecurity Analyst,No,Yes,4.0,...,0,0,0,0,0,0,0,0,0,0
1997,TensorFlow,OracleDB,Docker,Yes,Yes,Yes,Data Engineer,No,Yes,4.0,...,0,0,0,0,1,0,0,0,0,0
1998,.NET Core,Other (please specify),Git,Yes,Yes,Yes,Mobile Developer,No,Yes,1.0,...,0,0,0,0,0,0,0,0,1,0


In [9]:
# Normalise 'Frameworks' to lists of labels: lists are kept as they are,
# strings are split on commas, and missing values become [].
data['Frameworks'] = data['Frameworks'].apply(
    lambda cell: cell if isinstance(cell, list)
    else (cell.split(",") if pd.notna(cell) else [])
)

In [10]:
# Encode the 'Frameworks' column: refit the existing binarizer on the framework
# lists to get one 0/1 indicator column per distinct framework label.
Frameworks_encoded = mlb.fit_transform(data['Frameworks'])

# Create a DataFrame from the encoded 'Frameworks' data. It is built on data's
# own index so the index-based join below pairs every row with its own
# indicators even when data does not carry a default RangeIndex.
Frameworks_data = pd.DataFrame(Frameworks_encoded, columns=mlb.classes_, index=data.index)

# Join the new 'Frameworks_data' DataFrame to the original data.
# NOTE: the new columns carry no suffix, so join() raises if a framework label
# matches a column that already exists in data.
data = data.join(Frameworks_data)

In [11]:
# Normalise 'Databases' to lists of labels: lists are kept as they are,
# strings are split on commas, and missing values become [].
data['Databases'] = data['Databases'].apply(
    lambda cell: cell if isinstance(cell, list)
    else (cell.split(",") if pd.notna(cell) else [])
)

In [12]:
# Encode the 'Databases' column: refit the existing binarizer on the database
# lists to get one 0/1 indicator column per distinct database label.
Databases_encoded = mlb.fit_transform(data['Databases'])

# Create a DataFrame from the encoded 'Databases' data with a "_db" suffix to
# avoid overlap with existing column names. It is built on data's own index so
# the index-based join below pairs every row with its own indicators even when
# data does not carry a default RangeIndex.
Databases_data = pd.DataFrame(
    Databases_encoded,
    columns=[f"{col}_db" for col in mlb.classes_],
    index=data.index,
)

# Join the new 'Databases_data' DataFrame to the original data
data = data.join(Databases_data)


In [13]:
# Display the frame to check the newly joined indicator columns.
data

Unnamed: 0,Years_in_CS,Programming_Languages,Frameworks,Databases,DevOps_Tools,Networking_Experience,Cybersecurity_Experience,AI_Data_Science_Experience,Current_Role,Happy_in_Current_Role,...,Xamarin,CassandraDB_db,Microsoft SQL Server_db,MongoDB_db,MySQL_db,OracleDB_db,Other (please specify)_db,PostgreSQL_db,Redis_db,SQLite_db
0,,[Rust],[.NET Core],[CassandraDB],Git,Yes,Yes,No,Systems Analyst,No,...,0,1,0,0,0,0,0,0,0,0
1,1-3 years,[C],[CryEngine],[Other (please specify)],Other (please specify),Yes,Yes,Yes,Mobile Developer,Yes,...,0,0,0,0,0,0,1,0,0,0
2,Less than 1 year,[Ruby],[OpenVAS],[OracleDB],CircleCI,No,Yes,Yes,Cybersecurity Analyst,No,...,0,0,0,0,0,1,0,0,0,0
3,,[C#],[Kali Linux],[PostgreSQL],Terraform,Yes,Yes,Yes,Data Scientist,No,...,0,0,0,0,0,0,0,1,0,0
4,More than 5 years,[PHP],[Next.js],[CassandraDB],Azure,No,Yes,No,DevOps Engineer,No,...,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,3-5 years,[Java],[Snort],[OracleDB],Jenkins,No,No,Yes,Product Manager,No,...,0,0,0,0,0,1,0,0,0,0
1996,More than 5 years,[],[Apache Spark],[Other (please specify)],DigitalOcean,No,No,Yes,Cybersecurity Analyst,No,...,0,0,0,0,0,0,1,0,0,0
1997,More than 5 years,[PHP],[TensorFlow],[OracleDB],Docker,Yes,Yes,Yes,Data Engineer,No,...,0,0,0,0,0,1,0,0,0,0
1998,Less than 1 year,[Swift],[.NET Core],[Other (please specify)],Git,Yes,Yes,Yes,Mobile Developer,No,...,0,0,0,0,0,0,1,0,0,0


In [14]:
# Normalise 'DevOps_Tools' to lists of labels: lists are kept as they are,
# strings are split on commas, and missing values become [].
data['DevOps_Tools'] = data['DevOps_Tools'].apply(
    lambda cell: cell if isinstance(cell, list)
    else (cell.split(",") if pd.notna(cell) else [])
)

In [15]:
# Fresh binarizer for the DevOps tool labels.
mlb = MultiLabelBinarizer()

# Encode the 'DevOps_Tools' column: one 0/1 indicator column per distinct tool.
DevOps_Tools_encoded = mlb.fit_transform(data['DevOps_Tools'])

# Create a DataFrame from the encoded 'DevOps_Tools' data with a "_devops"
# suffix to avoid overlap with existing column names. It is built on data's own
# index so the index-based join below pairs every row with its own indicators
# even when data does not carry a default RangeIndex.
DevOps_Tools_data = pd.DataFrame(
    DevOps_Tools_encoded,
    columns=[f"{col}_devops" for col in mlb.classes_],
    index=data.index,
)

# Join the new 'DevOps_Tools_data' DataFrame to the original data
data = data.join(DevOps_Tools_data)

In [16]:
# Display the frame to check the newly joined "_devops" indicator columns.
data

Unnamed: 0,Years_in_CS,Programming_Languages,Frameworks,Databases,DevOps_Tools,Networking_Experience,Cybersecurity_Experience,AI_Data_Science_Experience,Current_Role,Happy_in_Current_Role,...,Docker_devops,Git_devops,GitLab CI/CD_devops,Google Cloud Platform_devops,Jenkins_devops,Kubernetes_devops,Nginx_devops,Other (please specify)_devops,Terraform_devops,Travis CI_devops
0,,[Rust],[.NET Core],[CassandraDB],[Git],Yes,Yes,No,Systems Analyst,No,...,0,1,0,0,0,0,0,0,0,0
1,1-3 years,[C],[CryEngine],[Other (please specify)],[Other (please specify)],Yes,Yes,Yes,Mobile Developer,Yes,...,0,0,0,0,0,0,0,1,0,0
2,Less than 1 year,[Ruby],[OpenVAS],[OracleDB],[CircleCI],No,Yes,Yes,Cybersecurity Analyst,No,...,0,0,0,0,0,0,0,0,0,0
3,,[C#],[Kali Linux],[PostgreSQL],[Terraform],Yes,Yes,Yes,Data Scientist,No,...,0,0,0,0,0,0,0,0,1,0
4,More than 5 years,[PHP],[Next.js],[CassandraDB],[Azure],No,Yes,No,DevOps Engineer,No,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,3-5 years,[Java],[Snort],[OracleDB],[Jenkins],No,No,Yes,Product Manager,No,...,0,0,0,0,1,0,0,0,0,0
1996,More than 5 years,[],[Apache Spark],[Other (please specify)],[DigitalOcean],No,No,Yes,Cybersecurity Analyst,No,...,0,0,0,0,0,0,0,0,0,0
1997,More than 5 years,[PHP],[TensorFlow],[OracleDB],[Docker],Yes,Yes,Yes,Data Engineer,No,...,1,0,0,0,0,0,0,0,0,0
1998,Less than 1 year,[Swift],[.NET Core],[Other (please specify)],[Git],Yes,Yes,Yes,Mobile Developer,No,...,0,1,0,0,0,0,0,0,0,0


In [17]:
# Encode the Yes/No answer as 1/0. Series.map turns every value that is not a
# key of the dict into NaN, so the extra 1 and 0 entries keep the cell
# idempotent: running it again leaves an already-encoded column unchanged
# instead of wiping it to NaN.
data['Networking_Experience'] = data['Networking_Experience'].map({'Yes': 1, 'No': 0, 1: 1, 0: 0})

In [18]:
# Encode the Yes/No answer as 1/0. Series.map turns every value that is not a
# key of the dict into NaN, so the extra 1 and 0 entries keep the cell
# idempotent: running it again leaves an already-encoded column unchanged
# instead of wiping it to NaN.
data['Cybersecurity_Experience'] = data['Cybersecurity_Experience'].map({'Yes': 1, 'No': 0, 1: 1, 0: 0})

In [19]:
# Encode the Yes/No answer as 1/0. Series.map turns every value that is not a
# key of the dict into NaN, so the extra 1 and 0 entries keep the cell
# idempotent: running it again leaves an already-encoded column unchanged
# instead of wiping it to NaN.
data['AI_Data_Science_Experience'] = data['AI_Data_Science_Experience'].map({'Yes': 1, 'No': 0, 1: 1, 0: 0})

In [35]:
# Encode both Yes/No columns as 1/0. Series.map turns every value that is not
# a key of the dict into NaN, so the extra 1 and 0 entries keep the cell
# idempotent: running it again leaves already-encoded columns unchanged
# instead of wiping them to NaN.
data['Happy_in_Current_Role'] = data['Happy_in_Current_Role'].map({'Yes': 1, 'No': 0, 1: 1, 0: 0})

data['Current_Role_Difficulty'] = data['Current_Role_Difficulty'].map({'Yes': 1, 'No': 0, 1: 1, 0: 0})

In [21]:
# Give every distinct role an integer code, numbered in order of first
# appearance in the column.
unique_roles = data['Current_Role'].unique()
role_mapping = dict(zip(unique_roles, range(len(unique_roles))))

# Replace the role values in 'Current_Role' with their integer codes.
data['Current_Role'] = data['Current_Role'].map(role_mapping)



In [22]:
# Display the frame to check the numeric encodings applied to the Yes/No and role columns.
data

Unnamed: 0,Years_in_CS,Programming_Languages,Frameworks,Databases,DevOps_Tools,Networking_Experience,Cybersecurity_Experience,AI_Data_Science_Experience,Current_Role,Happy_in_Current_Role,...,Docker_devops,Git_devops,GitLab CI/CD_devops,Google Cloud Platform_devops,Jenkins_devops,Kubernetes_devops,Nginx_devops,Other (please specify)_devops,Terraform_devops,Travis CI_devops
0,,[Rust],[.NET Core],[CassandraDB],[Git],1,1,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,1-3 years,[C],[CryEngine],[Other (please specify)],[Other (please specify)],1,1,1,1,1,...,0,0,0,0,0,0,0,1,0,0
2,Less than 1 year,[Ruby],[OpenVAS],[OracleDB],[CircleCI],0,1,1,2,0,...,0,0,0,0,0,0,0,0,0,0
3,,[C#],[Kali Linux],[PostgreSQL],[Terraform],1,1,1,3,0,...,0,0,0,0,0,0,0,0,1,0
4,More than 5 years,[PHP],[Next.js],[CassandraDB],[Azure],0,1,0,4,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,3-5 years,[Java],[Snort],[OracleDB],[Jenkins],0,0,1,8,0,...,0,0,0,0,1,0,0,0,0,0
1996,More than 5 years,[],[Apache Spark],[Other (please specify)],[DigitalOcean],0,0,1,2,0,...,0,0,0,0,0,0,0,0,0,0
1997,More than 5 years,[PHP],[TensorFlow],[OracleDB],[Docker],1,1,1,20,0,...,1,0,0,0,0,0,0,0,0,0
1998,Less than 1 year,[Swift],[.NET Core],[Other (please specify)],[Git],1,1,1,1,0,...,0,1,0,0,0,0,0,0,0,0


In [23]:
# Copy of the frame without the raw text/list columns that now have encoded
# counterparts.
clean = data.drop(
    columns=[
        'Programming_Languages',
        'Years_in_CS',
        'Frameworks',
        'Databases',
        'DevOps_Tools',
    ]
)

In [37]:
# Target: the 'Current_Role' column.
y = data['Current_Role']

# Features: every remaining column except the target and the raw text/list
# columns.
x = data.drop(
    columns=[
        'Current_Role',
        'Years_in_CS',
        'Programming_Languages',
        'Frameworks',
        'Databases',
        'DevOps_Tools',
    ]
)

In [38]:
# Hold out 20% of the rows as a test set; the fixed random_state keeps the
# split reproducible between runs.
split_parts = train_test_split(x, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts


In [49]:
# Check the dimensions (rows, columns) of the training features.
X_train.shape

(1600, 85)

In [40]:
# Random forest of 100 trees with a fixed seed, fitted on the training split.
clf = RandomForestClassifier(random_state=42, n_estimators=100)
clf.fit(X=X_train, y=y_train)

In [48]:

# Predict on the held-out split inside this cell so the evaluation does not
# depend on a y_pred left over from an earlier kernel session.
y_pred = clf.predict(X_test)

# Evaluate the model: y_test and y_pred both hold the integer role codes the
# classifier was trained on, so accuracy is the fraction of exact matches.
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.4f}')


Accuracy: 0.0575
