In [1]:
#############################################################################
# To support both python 2 and python 3
from __future__ import division, print_function, unicode_literals

# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "training_linear_models"

def save_fig(fig_id, tight_layout=True):
    path = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID, fig_id + ".png")
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format='png', dpi=300)

# Ignore useless warnings (see SciPy issue #5998)
import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")
#################################################################################


#Only WORDS
import pandas as pd
X= pd.read_csv("tableur_606EN.csv", encoding = "ISO-8859-1",sep=";",usecols = ["nb Mots"],nrows=200)
X=np.asarray(X, dtype=np.int).reshape(-1,1)
Y= pd.read_csv("tableur_606EN.csv", encoding = "ISO-8859-1",sep=";",usecols = ["Entités Nommées (Nom propres)"],nrows=200) #until row 151, so 150 values
Y=np.asarray(Y, dtype=np.int).reshape(-1,)


from sklearn.linear_model import LogisticRegression
log_reg = LogisticRegression(solver="liblinear", random_state=42)
log_reg.fit(X, Y)
print("log_reg.score :  "+str(log_reg.score(X,Y)))

#Test
from random import choices
X_new = choices(range(20, 80),k=250000)
X_new=np.asarray(X_new, dtype=np.int).reshape(-1,1)
y_proba = log_reg.predict_proba(X_new)
decision_boundary = X_new[y_proba[:, 1] >= 0.5][0]
print(y_proba)
print(decision_boundary)
print(log_reg.predict_proba([[37],[10],[300]]))





#WORDS + QUOTES
import pandas as pd
X= pd.read_csv("tableur_606EN.csv", encoding = "ISO-8859-1",sep=";",usecols = ["nb Mots","nb Quotes"],nrows=150)
X=np.asarray(X, dtype=np.int).reshape(-1,2)
Y= pd.read_csv("tableur_606EN.csv", encoding = "ISO-8859-1",sep=";",usecols = ["Entités Nommées (Nom propres)"],nrows=150) #until row 151, so 150 values
Y=np.asarray(Y, dtype=np.int).reshape(-1,)
print(X.shape)
print(Y.shape) 

from sklearn.linear_model import LogisticRegression
log_reg = LogisticRegression(solver="liblinear", random_state=42)
log_reg.fit(X, Y)
print("log_reg.score(X,Y) :  "+str(log_reg.score(X,Y)))

from random import choices
X_newQ=choices([0,1,2],k=60)
X_newQ=np.asarray(X_newQ, dtype=np.int).reshape(-1,1)

X_newW=np.arange(20, 80)
X_newW=np.asarray(X_newW, dtype=np.int).reshape(-1,1)  #(60,1)

X_new=np.array(list(zip(X_newQ, X_newW))).reshape(-1,2)
y_proba = log_reg.predict_proba(X_new)
decision_boundary = X_new[y_proba[:, 1] >= 0.5][0]
print(y_proba)
print(decision_boundary)
print(log_reg.predict_proba([[2,37],[1,10],[6,300],[1,69]]))






#test modele Words+Quotes sur les 200 phrases :
from sklearn.metrics import accuracy_score

# Mes 200 Phrases
X_phrasesWordsQuotes= pd.read_csv("tableur_606EN.csv", encoding = "ISO-8859-1",sep=";",header=0,usecols = ["nb Mots","nb Quotes"],nrows=200)
X_phrasesWordsQuotes=np.asarray(X_phrasesWordsQuotes, dtype=np.int).reshape(-1,2)

#On teste la proba (0 ou 1) sur nos 200 phrases
y_probaWordsQuotes = log_reg.predict(X_phrasesWordsQuotes)  #log_reg.predict_proba pour avoir la proba de chacun
print("y_probaWordsQuotes : ")
print(y_probaWordsQuotes)

#y_true est le tableau avec les vraies reponses d'entites nommes
y_true=pd.read_csv("tableur_606EN.csv", encoding = "ISO-8859-1",sep=";",header=0,usecols = ["Entités Nommées (Nom propres)"],nrows=200)
y_true=np.asarray(y_true, dtype=np.int).reshape(-1,)
print("y_true : ")
print(y_true)


count_a,count_b=0,0
for i in range(len(y_true)):
    if(y_true[i]==1):
        count_a=count_a+1
    if(y_probaWordsQuotes[i]==1):
        count_b=count_b+1
print("accuracy_score : "+str(accuracy_score(y_true,y_probaWordsQuotes)))  # Score est élevé grace aux nombreux 0 ....

E2=count_b/count_a
print("E2= nb  Reponses 'OUI' Détectes/Nb phrases ayant une Entité Nommée : "+str(E2))  #Score très faible pour les 1 donc ..

# accuracy_score compte le nb de reponses identiques 
E1= accuracy_score(y_true,y_probaWordsQuotes,normalize=False) /200  #160/200=0.8
print("E1=nb Vraies Bonnes reponses / nb Phrases Totales : " + str(E1))



log_reg.score :  0.81
[[0.61903279 0.38096721]
 [0.7062817  0.2937183 ]
 [0.67587227 0.32412773]
 ...
 [0.81504502 0.18495498]
 [0.48774502 0.51225498]
 [0.76818266 0.23181734]]
[79]
[[7.20844867e-01 2.79155133e-01]
 [8.71098965e-01 1.28901035e-01]
 [2.19823471e-04 9.99780177e-01]]
(150, 2)
(150,)
log_reg.score(X,Y) :  0.7533333333333333
[[0.81269997 0.18730003]
 [0.46469236 0.53530764]
 [0.65262124 0.34737876]
 [0.4539276  0.5460724 ]
 [0.44856069 0.55143931]
 [0.63773791 0.36226209]
 [0.79209608 0.20790392]
 [0.43253654 0.56746346]
 [0.62258568 0.37741432]
 [0.78118492 0.21881508]
 [0.41665231 0.58334769]
 [0.77368501 0.22631499]
 [0.60200875 0.39799125]
 [0.59680451 0.40319549]
 [0.76209781 0.23790219]
 [0.3905738  0.6094262 ]
 [0.58106486 0.41893514]
 [0.75010896 0.24989104]
 [0.7460243  0.2539757 ]
 [0.74189585 0.25810415]
 [0.55982469 0.44017531]
 [0.3600967  0.6399033 ]
 [0.72925063 0.27074937]
 [0.72495004 0.27504996]
 [0.53836318 0.46163682]
 [0.71622285 0.28377715]
 [0.711797

In [2]:
import pandas as pd
X= pd.read_csv("tableur_606EN.csv", encoding = "ISO-8859-1",sep=";",usecols = ["nb Mots"],nrows=200)
X=np.asarray(X, dtype=np.int).reshape(-1,1)
Y= pd.read_csv("tableur_606EN.csv", encoding = "ISO-8859-1",sep=";",usecols = ["Entités Nommées (Nom propres)"],nrows=200) #until row 151, so 150 values
Y=np.asarray(Y, dtype=np.int).reshape(-1,)

In [3]:
from sklearn.linear_model import LogisticRegression
log_reg = LogisticRegression(solver="liblinear", random_state=42)
log_reg.fit(X, Y)
log_reg.score(X,Y)

0.81

In [4]:
from random import choices
X_new = choices(range(20, 80),k=250000)
X_new=np.asarray(X_new, dtype=np.int).reshape(-1,1)
y_proba = log_reg.predict_proba(X_new)
decision_boundary = X_new[y_proba[:, 1] >= 0.5][0]
print(y_proba)
print(decision_boundary)
print(log_reg.predict_proba([[37],[10],[300]]))

[[0.66801809 0.33198191]
 [0.46996296 0.53003704]
 [0.64390184 0.35609816]
 ...
 [0.36635809 0.63364191]
 [0.81504502 0.18495498]
 [0.7486044  0.2513956 ]]
[67]
[[7.20844867e-01 2.79155133e-01]
 [8.71098965e-01 1.28901035e-01]
 [2.19823471e-04 9.99780177e-01]]


In [5]:
#WORDS + QUOTES
import pandas as pd
X= pd.read_csv("tableur_606EN.csv", encoding = "ISO-8859-1",sep=";",usecols = ["nb Mots","nb Quotes"],nrows=150)
X=np.asarray(X, dtype=np.int).reshape(-1,2)
Y= pd.read_csv("tableur_606EN.csv", encoding = "ISO-8859-1",sep=";",usecols = ["Entités Nommées (Nom propres)"],nrows=150) #until row 151, so 150 values
Y=np.asarray(Y, dtype=np.int).reshape(-1,)
print(X.shape)
print(Y.shape) 

(150, 2)
(150,)


In [6]:
from sklearn.linear_model import LogisticRegression
log_reg = LogisticRegression(solver="liblinear", random_state=42)
log_reg.fit(X, Y)
log_reg.score(X,Y)

0.7533333333333333

In [7]:
from random import choices
X_newQ=choices([0,1,2],k=60)
X_newQ=np.asarray(X_newQ, dtype=np.int).reshape(-1,1)

X_newW=np.arange(20, 80)
X_newW=np.asarray(X_newW, dtype=np.int).reshape(-1,1)  #(60,1)

X_new=np.array(list(zip(X_newQ, X_newW))).reshape(-1,2)
y_proba = log_reg.predict_proba(X_new)
decision_boundary = X_new[y_proba[:, 1] >= 0.5][0]
print(y_proba)
print(decision_boundary)
print(log_reg.predict_proba([[2,37],[1,10],[6,300],[1,69]]))

[[0.81269997 0.18730003]
 [0.80937841 0.19062159]
 [0.45930523 0.54069477]
 [0.80260065 0.19739935]
 [0.79914425 0.20085575]
 [0.63773791 0.36226209]
 [0.79209608 0.20790392]
 [0.62766477 0.37233523]
 [0.42722468 0.57277532]
 [0.78118492 0.21881508]
 [0.41665231 0.58334769]
 [0.60719004 0.39280996]
 [0.40615604 0.59384396]
 [0.76600505 0.23399495]
 [0.76209781 0.23790219]
 [0.75814593 0.24185407]
 [0.58106486 0.41893514]
 [0.75010896 0.24989104]
 [0.7460243  0.2539757 ]
 [0.56515847 0.43484153]
 [0.55982469 0.44017531]
 [0.73350869 0.26649131]
 [0.54911678 0.45088322]
 [0.72495004 0.27504996]
 [0.72060731 0.27939269]
 [0.34037127 0.65962873]
 [0.33552204 0.66447796]
 [0.33070725 0.66929275]
 [0.70282367 0.29717633]
 [0.51134438 0.48865562]
 [0.31647699 0.68352301]
 [0.68906647 0.31093353]
 [0.68440383 0.31559617]
 [0.30258223 0.69741777]
 [0.48425912 0.51574088]
 [0.67019406 0.32980594]
 [0.66538584 0.33461416]
 [0.4680421  0.5319579 ]
 [0.65566639 0.34433361]
 [0.45726603 0.54273397]


In [8]:
#test modele Words+Quotes sur les 200 phrases :
from sklearn.metrics import accuracy_score

# Mes 200 Phrases
X_phrasesWordsQuotes= pd.read_csv("tableur_606EN.csv", encoding = "ISO-8859-1",sep=";",header=0,usecols = ["nb Mots","nb Quotes"],nrows=200)
X_phrasesWordsQuotes=np.asarray(X_phrasesWordsQuotes, dtype=np.int).reshape(-1,2)

#On teste la proba (0 ou 1) sur nos 200 phrases
y_probaWordsQuotes = log_reg.predict(X_phrasesWordsQuotes)  #log_reg.predict_proba pour avoir la proba de chacun
print("y_probaWordsQuotes : ")
print(y_probaWordsQuotes)

#y_true est le tableau avec les vraies reponses d'entites nommes
y_true=pd.read_csv("tableur_606EN.csv", encoding = "ISO-8859-1",sep=";",header=0,usecols = ["Entités Nommées (Nom propres)"],nrows=200)
y_true=np.asarray(y_true, dtype=np.int).reshape(-1,)
print("y_true : ")
print(y_true)


count_a,count_b=0,0
for i in range(len(y_true)):
    if(y_true[i]==1):
        count_a=count_a+1
    if(y_probaWordsQuotes[i]==1):
        count_b=count_b+1
print("accuracy_score : "+str(accuracy_score(y_true,y_probaWordsQuotes)))  # Score est élevé grace aux nombreux 0 ....

E2=count_b/count_a
print("E2= nb  Reponses 'OUI' Détectes/Nb phrases ayant une Entité Nommée : "+str(E2))  #Score très faible pour les 1 donc ..

# accuracy_score compte le nb de reponses identiques 
E1= accuracy_score(y_true,y_probaWordsQuotes,normalize=False) /200  #160/200=0.8
print("E1=nb Vraies Bonnes reponses / nb Phrases Totales : " + str(E1))



y_probaWordsQuotes : 
[0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 1 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
y_true : 
[0 0 0 0 1 0 0 0 1 0 1 0 1 0 1 0 0 1 1 0 0 0 1 1 0 1 1 0 1 0 1 1 0 0 1 0 0
 1 0 0 1 0 1 1 0 0 0 0 1 1 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1
 1 1 0 1 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0
 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 1 0 1 0
 0 1 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0]
accuracy_score : 0.8
E2= nb  Reponses 'OUI' Détectes/Nb phrases ayant une Entité Nommée : 0.21951219512195122
E1=nb Vraies Bonnes reponses / nb Phrases To

In [9]:
#Test sur Uniquement Words
from sklearn.metrics import accuracy_score

# Mes 200 Phrases
X_phrasesWords= pd.read_csv("tableur_606EN.csv", encoding = "ISO-8859-1",sep=";",header=0,usecols = ["nb Mots"],nrows=200)
X_phrasesWords=np.asarray(X_phrasesWords, dtype=np.int).reshape(-1,1)

#On teste la proba (0 ou 1) sur nos 200 phrases
y_probaWords = log_reg.predict_proba(X_phrasesWords)  #log_reg.predict_proba pour avoir la proba de chacun
print("y_probaWords : ")
print(y_probaWords)

#y_true est le tableau avec les vraies reponses d'entites nommes
y_true=pd.read_csv("tableur_606EN.csv", encoding = "ISO-8859-1",sep=";",header=0,usecols = ["Entités Nommées (Nom propres)"],nrows=200)
y_true=np.asarray(y_true, dtype=np.int).reshape(-1,)
print("y_true : ")
print(y_true)


count_a,count_b=0,0
for i in range(len(y_true)):
    if(y_true[i]==1):
        count_a=count_a+1
    if(y_probaWordsQuotes[i]==1):
        count_b=count_b+1
print("accuracy_score : "+str(accuracy_score(y_true,y_probaWordsQuotes)))  # Score est élevé grace aux nombreux 0 ....

E2=count_b/count_a
print("E2= nb  Reponses 'OUI' Détectes/Nb phrases ayant une Entité Nommée : "+str(E2))  #Score très faible pour les 1 donc ..

# accuracy_score compte le nb de reponses identiques 
E1= accuracy_score(y_true,y_probaWordsQuotes,normalize=False) /200  #160/200=0.8
print("E1=nb Vraies Bonnes reponses / nb Phrases Totales : " + str(E1))



ValueError: X has 1 features per sample; expecting 2