In [943]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

import string

## Preparing the dataset:

In [944]:
# Load the occupation-picture table and print its schema (columns, dtypes, non-null counts).
occupation_csv = 'datasets/occupation_pictures.csv'
df = pd.read_csv(occupation_csv)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 658 entries, 0 to 657
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Job Title           658 non-null    object 
 1   Pic Title           658 non-null    object 
 2   Num Resumes         658 non-null    int64  
 3   Google Position     658 non-null    int64  
 4   Strict Face Count   658 non-null    int64  
 5   Relaxed Face Count  658 non-null    int64  
 6   KB size             658 non-null    float64
 7   Height              658 non-null    int64  
 8   Width               658 non-null    int64  
 9   Resolution          658 non-null    int64  
 10  Text Regions        658 non-null    int64  
 11  Picture URL         658 non-null    object 
 12  Manual Label        133 non-null    object 
dtypes: float64(1), int64(8), object(4)
memory usage: 67.0+ KB


The "Job Title" and "Pic Title" columns are useful for checking whether the Pic Title contains the words of the Job Title. For that, I will create a new feature, "JobTitle_in_PicTitle", that holds the fraction (0 to 1) of "Job Title" words that are present in "Pic Title".

In [945]:
#Function that processes the text of each title for comparison
lemmatizer = WordNetLemmatizer()

# English stop words plus single punctuation characters. Built once as a set:
# the contents never change between calls and set membership is O(1).
_STOP_TOKENS = set(stopwords.words('english')) | set(string.punctuation)

def pre_process_text(title):
    """Turn a title into a list of normalized word tokens.

    The title is lower-cased, split with nltk's word_tokenize, stripped of
    English stop words and punctuation tokens, and each remaining token is
    lemmatized so that similar word forms can be recognized as the same word.

    Args:
        title: the raw title string.

    Returns:
        A list of lemmatized tokens, in their original order (may be empty).
    """
    tokens = word_tokenize(title.lower())
    return [lemmatizer.lemmatize(token) for token in tokens if token not in _STOP_TOKENS]

#Function that returns the fraction (0.0 to 1.0) of 'Job Title' words that are present in "Pic Title"
def found_percentage(job_title,pic_title):
    """Return the share of pre-processed job-title words found in the picture title.

    Both titles go through pre_process_text before they are compared. Every
    token of the job title is counted, so a repeated word counts each time.

    Args:
        job_title: the job title string.
        pic_title: the picture title string.

    Returns:
        A float between 0.0 and 1.0. When the job title has no tokens left
        after pre-processing (e.g. it is empty or only stop words), 0.0 is
        returned instead of dividing by zero.
    """
    pic_words = set(pre_process_text(pic_title))  # set: constant-time lookups
    job_words = pre_process_text(job_title)

    # Nothing to look for: report "no match" rather than raising ZeroDivisionError.
    if not job_words:
        return 0.0

    matches = sum(1 for word in job_words if word in pic_words)
    return matches/len(job_words)

#Function that returns True if all words of 'Job Title' are present in "Pic Title" -> This works worse than found_percentage, so it is not used.
def found(job_title,pic_title):
    """Tell whether every pre-processed job-title word occurs in the picture title.

    Both titles are passed through pre_process_text first. A job title with
    no tokens left after pre-processing yields True.
    """
    pic_words = pre_process_text(pic_title)
    job_words = pre_process_text(job_title)
    return all(word in pic_words for word in job_words)

#Creates the new column JobTitle_in_PicTitle
# Walk the two title columns in parallel and score each (job title, picture title) pair.
df['JobTitle_in_PicTitle'] = [
    found_percentage(job, pic)
    for job, pic in zip(df['Job Title'], df['Pic Title'])
]

## Classification model

In [946]:
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as imbpipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

# Model choice. 'RFC' = RandomForestClassifier; the `if model_name == 'RFC'` branch
# further down builds the estimator and its hyper-parameter grid from this value.
model_name = 'RFC'

In [947]:
#The training dataset is just the rows that have a non-NaN Manual Label.
# The NaN check is restricted to the label column so that a missing value in
# any other column cannot silently remove a labelled example.
df_training = df.dropna(subset=['Manual Label'])

For a first analysis, "Google Position", "Num Resumes", "Relaxed Face Count" and "Picture URL" will not be used: judging by the labeled example pictures, these columns can be misleading or add no useful information for the labeling.

In [948]:
# Columns used as model inputs.
training_features = [
    "Strict Face Count",
    "KB size",
    "Height",
    "Width",
    "Resolution",
    "Text Regions",
    "JobTitle_in_PicTitle",
]
X = df_training[training_features]

# Target: the manual label encoded as an integer, 'good' -> 1 and 'bad' -> 0.
y = df_training['Manual Label'].map({'good':1,'bad':0})

In [949]:
# Class balance of the labelled rows (1 = good, 0 = bad).
print(y.value_counts()) 

1    91
0    42
Name: Manual Label, dtype: int64


In [950]:
# Hold out 25% of the labelled rows for the final check; the fixed seed keeps the split reproducible.
# NOTE(review): the classes are imbalanced (91 good / 42 bad) and this split is not stratified,
# so the class ratio of the small test set may differ from the training set — consider stratify=y.
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.25,random_state = 0)


if model_name == 'RFC':
    model = RandomForestClassifier(random_state=0)  #classifier: constructs a lot of decision trees and uses the class selected by most trees
    # Keys are prefixed with 'model__' because the estimator is the pipeline step named 'model'.
    hyper_params = {
        'model__n_estimators': [3,10],
        'model__max_depth': [None,2,3],
        'model__min_samples_split':[2,3]
    }
else:
    # Fail here with a clear message instead of a NameError on `model` when the pipeline is built.
    raise ValueError(f"Unsupported model_name: {model_name!r} (expected 'RFC')")


#defining the pipeline steps
pipeline = imbpipeline(steps=[
    #('scaler', StandardScaler()),                                     #optional transform (disabled): standard normalization of values (mean = 0, std = 1)
    ('smote', SMOTE(random_state=0)), #Sampler to deal with the target imbalance; imblearn's Pipeline applies samplers only while fitting
    ('model', model)
                        ])

# grid search definition
grid = GridSearchCV(
    pipeline,
    param_grid = hyper_params,
    scoring = 'f1',   #balance between precision (true positives among all predicted positives) and recall (true positives among all actual positives)
    cv = 10,          #10-fold cross-validation on the training split
    n_jobs = 1)

In [951]:
# Run the cross-validated grid search on the training split, then predict the
# held-out test rows with the best pipeline found.
grid.fit(X_train, y_train)
y_test_predict = grid.predict(X_test)

# Report the winning hyper-parameters and their mean cross-validated F1 score.
for caption, value in (("Best params", grid.best_params_), ("Best score", grid.best_score_)):
    print(caption, value)

Best params {'model__max_depth': 3, 'model__min_samples_split': 2, 'model__n_estimators': 3}
Best score 0.7402120102120102


### Checking score

In [952]:
# Confusion matrix on the held-out test set: rows = true label, columns = predicted label.
matrix = confusion_matrix(y_test, y_test_predict)
# Divide each row by its total so every entry is a rate within its true class.
matrix = matrix / matrix.sum(axis=1, keepdims=True)
print(matrix)

# (caption, true-label row, predicted-label column) with 0 = bad, 1 = good.
rate_cells = (
    ('True negative:', 0, 0),
    ('False positive:', 0, 1),
    ('False negative:', 1, 0),
    ('True positive:', 1, 1),
)
for caption, true_label, predicted_label in rate_cells:
    print(caption, round(matrix[true_label][predicted_label], 2))


[[0.72727273 0.27272727]
 [0.26086957 0.73913043]]
True negative: 0.73
False positive: 0.27
False negative: 0.26
True positive: 0.74


My evaluation: I think the model could be improved, mainly by reducing the number of false positives, because this is the worst type of error for this problem (it is better to miss good images than to accept a bad image as good). For example, the "Text Regions" feature should be important for marking an image as "bad", but in this model it does not influence the result very much. I tried changing a few other things and testing other models, but could not solve this within the 2–3 hours I was supposed to spend on the task.



## Predicting labels

In [953]:
# Predict a label for every row (labelled or not) and translate the model's
# integer output back to text: 1 -> 'good', 0 -> 'bad'.
df['N'] = pd.Series(grid.predict(df[training_features]), index=df.index).map({1:'good',0:'bad'})

# The engineered feature is only a model input, so leave it out of the exported file.
final_df = df.drop(columns=['JobTitle_in_PicTitle'])
final_df.to_csv('datasets/occupation_pictures_result.csv',index=False)

In [954]:
# Show the exported table, including the predicted 'N' column, as the cell output.
final_df

Unnamed: 0,Job Title,Pic Title,Num Resumes,Google Position,Strict Face Count,Relaxed Face Count,KB size,Height,Width,Resolution,Text Regions,Picture URL,Manual Label,N
0,3D Modeler,Our Top 19 3D Modeling Software Picks ...,1261,0,0,7,276.763672,1917,2301,4411017,2,https://format-magazine-production-res.cloudin...,,good
1,3D Modeler,7 Tips for Beginner 3D Modelers ...,1261,1,0,3,44.343750,340,747,253980,5,http://static1.squarespace.com/static/5320f582...,,bad
2,3D Modeler,How to Become an Expert 3D Modeler ...,1261,2,0,2,43.088867,380,660,250800,8,https://www.gamedesigning.org/wp-content/uploa...,,bad
3,3D Modeler,What is 3D Modeling & What's It Used For?,1261,3,0,2,78.623047,330,750,247500,1,https://cdn.conceptartempire.com/images/04/522...,,good
4,3D Modeler,Shoofping 3D Modeler ...,1261,4,0,1,97.517578,1075,1909,2052175,3,http://shoofping.com/wp-content/uploads/2017/1...,,good
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
653,Wind Turbine Engineer,Wind turbine technician fastest-growing ...,61,4,0,0,89.249023,720,1280,921600,0,https://i.ytimg.com/vi/lUf2mOYp1d4/maxresdefau...,good,good
654,Wind Turbine Engineer,How to Become a Wind Energy Engineer ...,61,5,0,0,37.949219,200,300,60000,0,https://www.environmentalscience.org/wp-conten...,,bad
655,Wind Turbine Engineer,Wind - Get Into Energy,61,6,0,1,324.730469,500,750,375000,1,http://getintoenergy.com/wp-content/uploads/20...,good,good
656,Wind Turbine Engineer,Wind-energy programs yield engineering ...,61,7,0,1,36.197266,920,613,563960,0,https://s.hdnux.com/photos/20/15/22/4248747/3/...,good,good
