## Chapter 8: Logistic Regression Model 


In [1]:
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import accuracy_score

### Bài tập 1: SelectKBest Breast Cancer 


In [2]:
breast_cancer = load_breast_cancer()
X = breast_cancer.data
y = breast_cancer.target

In [3]:
X.shape, y.shape

((569, 30), (569,))

In [4]:
breast_cancer.feature_names

array(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'area error',
       'smoothness error', 'compactness error', 'concavity error',
       'concave points error', 'symmetry error',
       'fractal dimension error', 'worst radius', 'worst texture',
       'worst perimeter', 'worst area', 'worst smoothness',
       'worst compactness', 'worst concavity', 'worst concave points',
       'worst symmetry', 'worst fractal dimension'], dtype='<U23')

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
num_feature_to_select = 10
k_best = SelectKBest(score_func=f_classif, k=num_feature_to_select)
X_train_kbest = k_best.fit_transform(X_train, y_train)
X_test_kbest = k_best.transform(X_test)

In [7]:
model = LogisticRegression()
model.fit(X_train_kbest, y_train)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [8]:
y_pred = model.predict(X_test_kbest)

In [9]:
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy score: {accuracy}')

Accuracy score: 0.9912280701754386


### Bài tập 2: Movie Review 

In [10]:
from nltk.corpus import stopwords
from wordcloud import WordCloud
from langdetect import detect
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix
import nltk
import matplotlib.pyplot as plt

In [11]:
mobile_review = pd.read_csv('data/data/movie_review.csv')

#### Read and overview data

In [12]:
mobile_review.head()

Unnamed: 0,text,sentiment
0,For a movie that gets no respect there sure ar...,0
1,Bizarre horror movie filled with famous faces ...,0
2,"A solid, if unremarkable film. Matthau, as Ein...",0
3,It's a strange feeling to sit alone in a theat...,0
4,"You probably all already know this by now, but...",0


In [13]:
mobile_review.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   text       25000 non-null  object
 1   sentiment  25000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 390.8+ KB


In [14]:
mobile_review.columns

Index(['text', 'sentiment'], dtype='object')

#### Check dup

In [15]:
mobile_review.text.duplicated().sum()

np.int64(96)

In [16]:
mobile_review.drop_duplicates(ignore_index=True, inplace=True)

In [17]:
mobile_review.duplicated().any()

np.False_

In [18]:
mobile_review.shape

(24904, 2)

#### Check imbalance dataset 

In [19]:
mobile_review.sentiment.value_counts()

sentiment
0    12472
1    12432
Name: count, dtype: int64

#### Data cleaning and preprocessing

In [20]:
from utils import clean_text

In [None]:
cleaner = clean_text()
mobile_review.text = mobile_review.text.apply(lambda x: cleaner.cleaning(x))

In [None]:
# import pandas 
# import tqdm
# tqdm.tqdm.pandas()
# df_lang = mobile_review.text.progress_apply(lambda x: detect(x))

100%|██████████| 24904/24904 [02:56<00:00, 140.98it/s]


In [31]:
df_lang.value_counts()

text
en    24884
af        7
fr        6
nl        4
pt        1
no        1
da        1
Name: count, dtype: int64

In [32]:
mobile_review = mobile_review[df_lang=='en']

In [40]:
X =  mobile_review['text']
y = mobile_review['sentiment']

#### Build model with counvecterize

In [41]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [42]:
vectorizer = CountVectorizer()
X_train_extracted = vectorizer.fit_transform(X_train)
X_test_extracted = vectorizer.transform(X_test)

In [43]:
model_1 = LogisticRegression()
model_1.fit(X_train_extracted, y_train)
y_pred = model_1.predict(X_test_extracted)
accuracy_score(y_test, y_pred)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.8784408278079164

#### Build model with Tf-idf-vectorize

In [44]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [45]:
vectorizer = TfidfVectorizer()
X_train_extracted = vectorizer.fit_transform(X_train)
X_test_extracted = vectorizer.transform(X_test)

In [46]:
model_2 = LogisticRegression()
model_2.fit(X_train_extracted, y_train)
y_pred = model_2.predict(X_test_extracted)
accuracy_score(y_test, y_pred)

0.8910990556560177

### Bài tập 3: PipeLine - Titanic 


#### Read and overview

In [96]:
url='https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv'
titanic = pd.read_csv(url)
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [97]:
titanic.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [98]:
titanic.shape

(891, 12)

In [99]:
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


#### Data preprocessing

In [100]:
titanic.drop(columns=['Name', 'Ticket', 'Cabin', 'Embarked'], axis = 1, inplace=True )

In [101]:
titanic.Sex = titanic.Sex.apply(lambda x: 0 if x=='male' else 1)

In [102]:
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Sex          891 non-null    int64  
 4   Age          714 non-null    float64
 5   SibSp        891 non-null    int64  
 6   Parch        891 non-null    int64  
 7   Fare         891 non-null    float64
dtypes: float64(2), int64(6)
memory usage: 55.8 KB


In [103]:
titanic.Age.fillna(titanic.groupby('Pclass')['Age'].transform('mean'), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  titanic.Age.fillna(titanic.groupby('Pclass')['Age'].transform('mean'), inplace=True)


In [104]:
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Sex          891 non-null    int64  
 4   Age          891 non-null    float64
 5   SibSp        891 non-null    int64  
 6   Parch        891 non-null    int64  
 7   Fare         891 non-null    float64
dtypes: float64(2), int64(6)
memory usage: 55.8 KB


In [105]:
titanic['Familysize'] = titanic['Parch'] + titanic['SibSp']

In [106]:
class_group = titanic.groupby('Pclass')['PassengerId'].count().reset_index()
class_group.columns = ['Pclass', 'ClassCount']
titanic = titanic.merge(class_group, on='Pclass', how='left')

In [107]:
X = titanic.drop(columns=['Survived', 'SibSp', 'Parch'])
y = titanic['Survived']

In [108]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [109]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
_pipeline = Pipeline(
    [
        ('scaler', StandardScaler()),
        ('SelectFeature', SelectKBest()),
        ('modeling', LogisticRegression())
    ]
)

In [110]:
_pipeline.fit(X_train, y_train)



In [111]:
y_pred = _pipeline.predict(X_test)

In [112]:
accuracy_score(y_test, y_pred)

0.8340807174887892