## Import and setup

In [1]:
#supervised learning
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

In [2]:
#base
import pandas as pd
import numpy as np
import pickle

In [3]:
data = pd.read_csv('cleaned.csv')
data.info()
data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2186 entries, 0 to 2185
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   bando        2186 non-null   object
 1   piattaforma  2185 non-null   object
 2   commento_p   2180 non-null   object
 3   target       2186 non-null   int64 
dtypes: int64(1), object(3)
memory usage: 68.4+ KB


Unnamed: 0,bando,piattaforma,commento_p,target
0,faber i ed,bandi online,troppe informazioni ripetere moduli facilmente...,0
1,e di nuovo sport,bandi online,facile comprensibile utilizzo,1
2,smaltimento amianto anno 2019,bandi online,non non_ricevuto non_risposta non_richiesta no...,0
3,musei 2016,siage,non non_automatismi non_sistema non_funzionano...,0
4,rinnova autoveicoli,bandi online,non non_chiari non_percorsi non_operare non_pa...,0


## Supervised text classification

In [4]:
data.fillna(value = '', inplace=True)

In [5]:
pipeline = Pipeline([
                     ("tfidf", TfidfVectorizer(ngram_range=(1, 2))),
                     ("clf", LogisticRegression())
])

In [6]:
score = cross_validate(pipeline, X = data['commento_p'], y = data['target'])
print("Mean accuracy: ", score['test_score'].mean())

Mean accuracy:  0.7548101940378044


### Grid search

In [7]:
grid={"clf__C":np.logspace(-3,3,7), "clf__penalty":["none","l2"]}# l1 lasso l2 ridge
logreg_cv=GridSearchCV(pipeline,grid,cv=10)
logreg_cv.fit(data['commento_p'],data['target'])

print("tuned hyperparameters :(best parameters) ",logreg_cv.best_params_)
print("accuracy :",logreg_cv.best_score_)

  "Setting penalty='none' will ignore the C and l1_ratio "
  "Setting penalty='none' will ignore the C and l1_ratio "
  "Setting penalty='none' will ignore the C and l1_ratio "
  "Setting penalty='none' will ignore the C and l1_ratio "
  "Setting penalty='none' will ignore the C and l1_ratio "
  "Setting penalty='none' will ignore the C and l1_ratio "
  "Setting penalty='none' will ignore the C and l1_ratio "
  "Setting penalty='none' will ignore the C and l1_ratio "
  "Setting penalty='none' will ignore the C and l1_ratio "
  "Setting penalty='none' will ignore the C and l1_ratio "
  "Setting penalty='none' will ignore the C and l1_ratio "
  "Setting penalty='none' will ignore the C and l1_ratio "
  "Setting penalty='none' will ignore the C and l1_ratio "
  "Setting penalty='none' will ignore the C and l1_ratio "
  "Setting penalty='none' will ignore the C and l1_ratio "
  "Setting penalty='none' will ignore the C and l1_ratio "
  "Setting penalty='none' will ignore the C and l1_ratio

tuned hyperparameters :(best parameters)  {'clf__C': 10.0, 'clf__penalty': 'l2'}
accuracy : 0.7749214528088476


Final train and pickle out

In [9]:
pipeline = Pipeline([
                     ("tfidf", TfidfVectorizer(ngram_range = (1, 2))),
                     ("clf", LogisticRegression(C = 10))
])
pipeline.fit(data['commento_p'], data['target'])
with open('final_pipeline.pickle', 'wb') as file:
  pickle.dump(pipeline, file)