## Loading the standard libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

## Load the data

In [3]:
data = pd.read_csv('spam.csv')
data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


## Observations:

1. Category contains only two values, spam or ham: spam means a harmful message and ham means a safe message
2. Message is independent data
3. Category is dependent data

In [4]:
data.shape

(5572, 2)

### Rearrange the data with target as the last column in the dataset

In [7]:
# Put the feature column first and the target column last.
# .loc with an explicit column list selects and reorders in a single step.
data = data.loc[:, ['Message', 'Category']]
data.head()

Unnamed: 0,Message,Category
0,"Go until jurong point, crazy.. Available only ...",ham
1,Ok lar... Joking wif u oni...,ham
2,Free entry in 2 a wkly comp to win FA Cup fina...,spam
3,U dun say so early hor... U c already then say...,ham
4,"Nah I don't think he goes to usf, he lives aro...",ham


## Notes:

1. In order to apply any Classification ML algorithm on the data, the data must be encoded to numerical.

In [8]:
### Apply encoding on category
# Label-encode the target: ham -> 0, spam -> 1 (spam is the positive class).
dic = {'ham' : 0, 'spam' : 1}
# replace() leaves already-encoded values untouched, so re-running this cell is safe.
# The explicit astype('int64') guarantees an integer target: pandas 2.2 deprecates the
# implicit object -> int downcast inside replace(), and without the cast the column
# can remain dtype=object, which scikit-learn classifiers may reject as a label type.
# assign() builds a new frame instead of writing into `data`, which is itself the
# result of a column selection (avoids a SettingWithCopyWarning that the global
# warnings filter would otherwise hide).
data = data.assign(Category = data['Category'].replace(dic).astype('int64'))
data.head()

Unnamed: 0,Message,Category
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0


In [9]:
## Apply encoding on Message

data['Message'].iloc[0]

'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'

In [10]:
## Apply One Hot encoding on the first row of the Message column

# The argument here is a single Python string (one whole message), not a column.
# get_dummies treats it as one category, so the result is a single indicator
# column named after the full message text: no word-level features are produced.
pd.get_dummies(data['Message'].iloc[0])

Unnamed: 0,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat..."
0,1


### To encode this kind of heterogeneous text data we use CountVectorizer()

In [16]:
text = ['It was the best of times', 'India, is an, incredible country', 'I stay in India']
text

['It was the best of times',
 'India, is an, incredible country',
 'I stay in India']

In [17]:
from sklearn.feature_extraction.text import CountVectorizer
vec = CountVectorizer()
vec

In [18]:
vec.fit(text)

In [19]:
vec.vocabulary_

{'it': 7,
 'was': 12,
 'the': 10,
 'best': 1,
 'of': 8,
 'times': 11,
 'india': 5,
 'is': 6,
 'an': 0,
 'incredible': 4,
 'country': 2,
 'stay': 9,
 'in': 3}

In [20]:
text

['It was the best of times',
 'India, is an, incredible country',
 'I stay in India']

In [21]:
vec.transform(text)

<3x13 sparse matrix of type '<class 'numpy.int64'>'
	with 14 stored elements in Compressed Sparse Row format>

In [22]:
vec.transform(text).toarray()

array([[0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1],
       [1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0]], dtype=int64)

In [23]:
vec.get_feature_names_out()

array(['an', 'best', 'country', 'in', 'incredible', 'india', 'is', 'it',
       'of', 'stay', 'the', 'times', 'was'], dtype=object)

In [24]:
# Tabulate the document-term counts: one row per sentence in `text`,
# one column per vocabulary word learned by `vec`.
res = pd.DataFrame(
    data = vec.transform(text).toarray(),
    columns = vec.get_feature_names_out(),
)
res

Unnamed: 0,an,best,country,in,incredible,india,is,it,of,stay,the,times,was
0,0,1,0,0,0,0,0,1,1,0,1,1,1
1,1,0,1,0,1,1,1,0,0,0,0,0,0
2,0,0,0,1,0,1,0,0,0,1,0,0,0


In [25]:
text

['It was the best of times',
 'India, is an, incredible country',
 'I stay in India']

In [26]:
text2 = ['I am interested to learn Data Science', 'I love my country', 'The times of India']
text2

['I am interested to learn Data Science',
 'I love my country',
 'The times of India']

In [27]:
from sklearn.feature_extraction.text import CountVectorizer
vec = CountVectorizer()
vec

In [28]:
vec.fit_transform(text2)

<3x13 sparse matrix of type '<class 'numpy.int64'>'
	with 13 stored elements in Compressed Sparse Row format>

In [34]:
vec.fit_transform(text2).toarray()

array([[1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1],
       [0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0]], dtype=int64)

In [30]:
vec.get_feature_names_out()

array(['am', 'country', 'data', 'india', 'interested', 'learn', 'love',
       'my', 'of', 'science', 'the', 'times', 'to'], dtype=object)

In [33]:
# Fit the vectorizer on `text2` and tabulate the resulting counts:
# one row per sentence, one column per vocabulary word.
res2 = pd.DataFrame(
    data = vec.fit_transform(text2).toarray(),
    columns = vec.get_feature_names_out(),
)
res2

Unnamed: 0,am,country,data,india,interested,learn,love,my,of,science,the,times,to
0,1,0,1,0,1,1,0,0,0,1,0,0,1
1,0,1,0,0,0,0,1,1,0,0,0,0,0
2,0,0,0,1,0,0,0,0,1,0,1,1,0


## Spam Ham Classification

In [35]:
data.head()

Unnamed: 0,Message,Category
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0


## Separate X and y from the data

In [45]:
# Features: the raw message text. Target: the encoded category (0 = ham, 1 = spam).
X, y = data['Message'], data['Category']

In [46]:
X.head()

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
Name: Message, dtype: object

## Encode X using CountVectorizer()

In [47]:
from sklearn.feature_extraction.text import CountVectorizer
vec = CountVectorizer()
vec

In [48]:
X_vec = vec.fit_transform(X)
X_vec

<5572x8709 sparse matrix of type '<class 'numpy.int64'>'
	with 74098 stored elements in Compressed Sparse Row format>

In [49]:
vec.get_feature_names_out()

array(['00', '000', '000pes', ..., 'èn', 'ú1', '〨ud'], dtype=object)

## Split the data into train and test sets

In [50]:
from sklearn.model_selection import train_test_split

# Split the raw messages first (same 70/30 split and seed as before), so the
# vocabulary below is learned without ever seeing the held-out test messages.
X_train_text, X_test_text, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)

# Re-fit the vectorizer on the training messages only: fitting it on the full
# corpus leaks test-set vocabulary into the features and flatters the metrics.
# Words that occur only in the test messages are ignored by transform(), exactly
# as they would be for genuinely unseen messages.
# NOTE: outputs recorded further down (accuracy, confusion matrix, feature table)
# were produced with the old full-corpus vocabulary and will shift slightly on re-run.
vec = CountVectorizer()
X_train = vec.fit_transform(X_train_text)
X_test = vec.transform(X_test_text)

## Apply Logistic Regression on the train set

In [51]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr

In [52]:
lr.fit(X_train, y_train)

## Perform predictions on X_test

In [66]:
y_pred = lr.predict(X_test)
y_pred

array([0, 1, 0, ..., 0, 0, 0], dtype=int64)

In [56]:
# View the test-set count matrix with one column per vocabulary term.
# from_spmatrix keeps the data sparse; X_test.toarray() would allocate a dense
# rows x vocabulary array even though almost every entry is zero.
pd.DataFrame.sparse.from_spmatrix(X_test, columns = vec.get_feature_names_out())

Unnamed: 0,00,000,000pes,008704050406,0089,0121,01223585236,01223585334,0125698789,02,...,zhong,zindgi,zoe,zogtorius,zoom,zouk,zyada,èn,ú1,〨ud
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1667,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1668,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1669,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1670,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [57]:
message1 = [
    'Even my brother does not like to speak with me. They treat me as a aids patient'
]

In [58]:
lr.predict(vec.transform(message1))

array([0], dtype=int64)

In [60]:
message2 = [
    'England vs Macedonia - dont miss the goals/team news. txt ur national team to 87077 eg ENGLAND TO 87077Try:Wales, SCOTLAND 4txt/A~0&.20'
]
message2

['England vs Macedonia - dont miss the goals/team news. txt ur national team to 87077 eg ENGLAND TO 87077Try:Wales, SCOTLAND 4txt/A~0&.20']

In [61]:
lr.predict(vec.transform(message2))

array([1], dtype=int64)

In [62]:
message3 =[
    'FREE!, WINNER!, You have won lottery ticket. You are entitled to win a Audi A8. To claim call 872345798'
]
message3

['FREE!, WINNER!, You have won lottery ticket. You are entitled to win a Audi A8. To claim call 872345798']

In [64]:
lr.predict(vec.transform(message3))

array([1], dtype=int64)

## Check Accuracy

In [67]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

0.9760765550239234

In [68]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

array([[1445,    6],
       [  34,  187]], dtype=int64)

Treating spam (1) as the positive class:  
1445 - True Negative - Actual data says the message is ham and prediction also says ham  
187 - True Positive - Actual data says the message is spam and prediction also says spam  
34 - False Negative - Actual data says the message is spam and prediction says that the message is ham  
6 - False Positive - Actual data says the message is ham and prediction says that the message is spam  

## Create a pipeline to perform text processing

- To create a pipeline we follow the below steps
1. Import all the necessary libraries
2. Define the steps needed to create the pipeline
3. Create the pipeline using the `Pipeline` class from the `sklearn.pipeline` module

In [70]:
from sklearn.feature_extraction.text import CountVectorizer
vec = CountVectorizer()
vec

In [71]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr

In [73]:
## Define the steps to create the pipeline

# Each step is a (name, estimator) pair; the vectorizer comes first and feeds the classifier.
steps = [
    ('Text Processing', vec),
    ('ML MOdelling', lr),
]
steps

[('Text Processing', CountVectorizer()),
 ('ML MOdelling', LogisticRegression())]

In [75]:
## Create the pipeline using sklearn.pipeline library

from sklearn.pipeline import Pipeline

# Chain the (name, estimator) pairs defined in `steps` into a single estimator object.
pipe = Pipeline(steps = steps)
pipe

dataset = pd.read_csv(

In [76]:
dataset = pd.read_csv('train.tsv', sep = '\t')
dataset.head()

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,10.0,1,No description yet
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,0,This keyboard is in great condition and works ...
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,10.0,1,Adorable top with a hint of lace and a key hol...
3,3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,,35.0,1,New with tags. Leather horses. Retail for [rm]...
4,4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,,44.0,0,Complete with certificate of authenticity


In [77]:
dataset.shape

(1482535, 8)

## Observations:

- Data contains 14 lakh 82 thousand 5 hundred and thirty five rows.
- Data contains only 8 columns

## Notes:

- On the above data, following can be considered
1. Create a num_pipeline for all the numerical variables
2. Create a categorical pipeline for the categorical variables
3. Create the text pipeline
4. Separate the num_variables, cat_variables, text_variables
5. Create a Column Transformer
6. Apply the Column Transformer on the data
7. Apply ML
8. Check accuracy

In [80]:
dataset['item_description']

0                                         No description yet
1          This keyboard is in great condition and works ...
2          Adorable top with a hint of lace and a key hol...
3          New with tags. Leather horses. Retail for [rm]...
4                  Complete with certificate of authenticity
                                 ...                        
1482530    Lace, says size small but fits medium perfectl...
1482531     Little mermaid handmade dress never worn size 2t
1482532            Used once or twice, still in great shape.
1482533    There is 2 of each one that you see! So 2 red ...
1482534    New with tag, red with sparkle. Firm price, no...
Name: item_description, Length: 1482535, dtype: object