Using Logistic Regression to Train the Model

In [7]:
!pip install pandas scikit-learn nltk





In [8]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# NLTK data used by the preprocessing step below:
#   punkt / punkt_tab -> tokenizer models used by word_tokenize
#   stopwords         -> English stop-word list
#   wordnet           -> dictionary used by WordNetLemmatizer
# 'punkt_tab' is what newer NLTK releases load for word_tokenize; since the
# install above is unpinned, fetch both so the notebook runs on old and new NLTK.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [9]:
# Colab-only helper for moving files between the browser and the runtime.
from google.colab import files
# Opens the browser file picker and keeps the result of the upload in `uploaded`.
# NOTE(review): the output below shows Colab saved the file under a " (8)"
# suffix because a file with that name already existed in the runtime.
uploaded = files.upload()



Saving occ_classification_50k_sample.csv to occ_classification_50k_sample (8).csv


In [11]:
import io

# Load the dataset from the bytes returned by the upload cell rather than from a
# fixed filename: when a file with the same name already exists, Colab saves the
# new upload under a suffixed name (e.g. "... (8).csv"), so reading the
# un-suffixed path would silently load an older copy.
if uploaded:
    uploaded_bytes = next(iter(uploaded.values()))
    data = pd.read_csv(io.BytesIO(uploaded_bytes))
else:
    # Nothing was uploaded in this session; fall back to a file already on disk.
    data = pd.read_csv('occ_classification_50k_sample.csv')


In [13]:
# preprocessing function
# The stop-word set and the lemmatizer are created once, on first use, and then
# reused: rebuilding them inside every call repeats the same setup for each row.
_STOP_WORDS = None
_LEMMATIZER = None


def preprocess_text(text):
    """Normalise one piece of text into a space-separated string of lemmas.

    Steps: lowercase, tokenize with NLTK ``word_tokenize``, keep only purely
    alphabetic tokens that are not English stop words, lemmatize each kept
    token with ``WordNetLemmatizer`` and join the results with single spaces.

    Args:
        text: The raw text. Any non-string value (e.g. ``None`` or NaN) is
            treated as missing.

    Returns:
        The cleaned text, or ``""`` when ``text`` is not a string.
    """
    global _STOP_WORDS, _LEMMATIZER

    # missing or non-string values
    if not isinstance(text, str):
        return ""

    # Lazy one-time initialisation of the shared NLTK resources.
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english'))
    if _LEMMATIZER is None:
        _LEMMATIZER = WordNetLemmatizer()

    tokens = word_tokenize(text.lower())

    # Drop stop words and non-alphabetic tokens, lemmatize the rest, and
    # return the tokens as a single string.
    return ' '.join(
        _LEMMATIZER.lemmatize(token)
        for token in tokens
        if token.isalpha() and token not in _STOP_WORDS
    )


# Join title and description with a single space (missing values become empty
# strings) and run the combined text through preprocess_text.
data['processed_text'] = (
    data['title'].fillna('') + ' ' + data['description'].fillna('')
).apply(preprocess_text)


In [14]:
# Show the first five rows of the raw columns next to the cleaned text.
print(data.loc[:, ['title', 'description', 'processed_text']].head(5))


                          title  \
0             digital marketing   
1            accounts and audit   
2          software engineering   
3                staff engineer   
4  project assistant - research   

                                         description  \
0  about the internship selected interns day-to-d...   
1  who can apply . are available for full time in...   
2  job description ,wm technology - developer - s...   
3  summary do you want to work on some of the mos...   
4  indian institute of technology kharagpur iit k...   

                                      processed_text  
0  digital marketing internship selected intern r...  
1  account audit apply available full time intern...  
2  software engineering job description wm techno...  
3  staff engineer summary want work exciting aspe...  
4  project assistant research indian institute te...  


In [15]:
from sklearn.model_selection import train_test_split

# Features (X): the preprocessed text; target (y): the SOC2 category column
X = data['processed_text']
y = data['SOC2']

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print("Training set size:", X_train.shape[0])
print("Test set size:", X_test.shape[0])


Training set size: 40000
Test set size: 10000


In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over unigrams, bigrams and trigrams, limited to 5000 features
vectorizer = TfidfVectorizer(
    max_features=5000,  # adjust as needed
    ngram_range=(1, 3),
)

# The vectorizer is fitted on the training text only; the test text is then
# transformed with that already-fitted vectorizer.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

print("TF-IDF feature matrix shape:", X_train_tfidf.shape)


TF-IDF feature matrix shape: (40000, 5000)


In [17]:
from sklearn.linear_model import LogisticRegression


from sklearn.multiclass import OneVsRestClassifier

# One-vs-rest logistic regression: one binary classifier per SOC2 class.
# The `multi_class='ovr'` argument of LogisticRegression is deprecated in recent
# scikit-learn releases; wrapping the estimator in OneVsRestClassifier is the
# documented way to keep the same strategy.
model = OneVsRestClassifier(LogisticRegression(max_iter=1000))

model.fit(X_train_tfidf, y_train)

print("Model training complete!")




Model training complete!


Accuracy Testing

In [18]:
from sklearn.metrics import classification_report, accuracy_score


# Predict a class for every held-out document.
y_pred = model.predict(X_test_tfidf)

# Some classes are never predicted, which leaves their precision undefined.
# zero_division=0 reports those scores as 0.0 explicitly instead of emitting an
# UndefinedMetricWarning for each affected metric.
print(classification_report(y_test, y_pred, zero_division=0))

# Overall fraction of test documents whose predicted class matches the label.
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy * 100:.2f}%")


              precision    recall  f1-score   support

          11       0.63      0.69      0.65      1507
          13       0.56      0.49      0.52       948
          15       0.82      0.93      0.87      3055
          17       0.71      0.66      0.68       545
          19       0.54      0.48      0.51       140
          21       0.62      0.37      0.46       214
          23       0.64      0.28      0.39        25
          25       0.74      0.39      0.51       101
          27       0.83      0.80      0.82       527
          29       0.74      0.60      0.66       130
          31       0.00      0.00      0.00        22
          33       0.80      0.50      0.62        16
          35       1.00      0.09      0.16        23
          37       0.00      0.00      0.00        10
          39       0.00      0.00      0.00        33
          41       0.63      0.49      0.55       695
          43       0.71      0.84      0.77      1517
          45       0.00    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Saving the test features and predictions as a CSV file

In [20]:
import pandas as pd


# One row per test document: its TF-IDF features plus the true and predicted labels.
# NOTE(review): .toarray() materialises the whole sparse matrix as a dense array,
# which can be large — confirm this fits in memory for bigger datasets.
test_data = pd.DataFrame(X_test_tfidf.toarray(), columns=vectorizer.get_feature_names_out())
# y_test still carries the row labels of the original DataFrame, while test_data
# has a fresh 0..n-1 index. Assigning the Series directly would align on those
# labels and scramble / NaN-fill the column, so assign the values positionally.
test_data['true_label'] = y_test.to_numpy()
test_data['predicted_label'] = y_pred



In [21]:

# Write the feature/label table to disk, leaving out the DataFrame index.
test_data.to_csv(path_or_buf='final_processed_dataset.csv', index=False)


In [23]:
from google.colab import files


# Write the table out again (without the index) so the download matches the
# current contents of test_data.
test_data.to_csv(path_or_buf='final_processed_dataset.csv', index=False)

# Send the written file to the browser as a download.
files.download('final_processed_dataset.csv')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>