In [19]:
from pypdf import PdfReader
import re

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.multiclass import OneVsRestClassifier

In [3]:
pdf = PdfReader("./PawanKumar_Resume.pdf")
page = pdf.pages[0]

In [4]:
texts = page.extract_text()

In [6]:
# atext = []
# for text in texts:
#     atext.append(text)

# print("\n".join(atext))

In [10]:
text = re.sub(r"\s+", " ", texts).strip()
# print(text.strip())

In [12]:
sentences = re.split(r"(?<=[\.\!\?])\s+", text)
textList = [s.strip() for s in sentences if s.strip()]

In [20]:
# lines = sentences.splitlines()
# bullets = []
# for line in lines:
#     stripped = line.strip()
#     if stripped.startswith(("-", "•", "*")):
#         bullets.append(stripped.lstrip("-•* ").strip())
#     print(bullets)


In [10]:
train_csv = pd.read_csv("dataset/resume_suggestions_train.csv")
test_csv = pd.read_csv("dataset/resume_suggestions_test.csv")

In [11]:
train_csv.head()

Unnamed: 0,suggestion,label
0,Evaluators often find that Keywords relevant t...,keyword_match
1,Generic phrasing reduces the impact of your me...,weak_phrases
2,Recruiters may think that Your resume could be...,keyword_match
3,it appears that the structure of your resume c...,organization
4,It appears that The resume may be too long for...,length_long


In [12]:
test_csv.head()

Unnamed: 0,suggestion,label
0,It seems that Action-driven phrasing is limite...,action_verbs
1,You could focus on improving this area: The re...,length_long
2,Consider improving this area: Some details app...,length_long
3,Evaluators often find that Your achievements d...,metrics
4,Few accomplishments include quantitative resul...,metrics


In [29]:
X_train, y_train = train_csv["suggestion"], train_csv["label"]
X_test, y_test = test_csv["suggestion"], test_csv["label"]

In [30]:
mlb = MultiLabelBinarizer()

In [31]:
y_train = mlb.fit_transform(y_train)
y_test = mlb.transform(y_test)

In [32]:
y_train, y_test

(array([[1, 1, 0, ..., 1, 1, 0],
        [1, 1, 0, ..., 1, 0, 0],
        [1, 1, 0, ..., 1, 1, 0],
        ...,
        [0, 1, 0, ..., 0, 0, 0],
        [1, 1, 0, ..., 1, 0, 0],
        [0, 1, 0, ..., 0, 0, 1]]),
 array([[1, 1, 1, ..., 0, 0, 0],
        [1, 0, 0, ..., 0, 0, 0],
        [1, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 1, 1, ..., 0, 0, 0],
        [0, 1, 0, ..., 0, 0, 1],
        [0, 0, 0, ..., 0, 0, 0]]))

In [36]:
model = Pipeline([
    ("tfidf", TfidfVectorizer()),
    ("clf", OneVsRestClassifier(LogisticRegression(max_iter=300)))
])

In [37]:
# model = LogisticRegression()
model.fit(X_train, y_train)

In [40]:
preds = model.predict(X_test)
print(classification_report(y_test, preds, target_names=mlb.classes_))

              precision    recall  f1-score   support

           _       1.00      1.00      1.00      1791
           a       1.00      1.00      1.00      1573
           b       1.00      1.00      1.00       397
           c       1.00      1.00      1.00      1174
           d       1.00      1.00      1.00       287
           e       1.00      1.00      1.00      2098
           f       1.00      1.00      1.00       320
           g       1.00      1.00      1.00      1622
           h       1.00      1.00      1.00      1211
           i       1.00      1.00      1.00      1589
           k       1.00      1.00      1.00       567
           l       1.00      1.00      1.00       644
           m       1.00      1.00      1.00      1283
           n       1.00      1.00      1.00      1833
           o       1.00      1.00      1.00      2213
           p       1.00      1.00      1.00       280
           r       1.00      1.00      1.00      2104
           s       1.00    

In [41]:
path = "112221kjhScioj2ufh.pdf"
path.split(".")[1]

'pdf'