# Feature extraction of Text data using Tf-idf Vectorizer

In [1]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk

In [2]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [4]:
# Load the employee records from a CSV given by a relative path and preview
# the first rows.
# NOTE(review): the preview shows an 'Unnamed: 0' column, which suggests the
# file was saved together with its index; consider index_col=0 if that column
# is not needed -- confirm against the dataset.
emp = pd.read_csv('./../../dataset/employee.csv')
emp.head()

Unnamed: 0,name,lastName,gender,age,education,salary,score
0,Ali,Rahmati,male,20.0,master,40000.0,80.0
1,Assadullah,Rahimi,male,20.0,bachler,31000.0,98.0
2,Abdullah,Rahnaward,male,25.0,bachler,45000.0,99.0
3,Ahamd,Ahmadi,male,18.0,doctor,23000.0,88.0
4,Zahra,Rasoli,female,25.0,master,14000.0,75.0


In [5]:
# Build one short sentence per employee from the first- and last-name columns;
# this synthetic text is what gets stemmed and vectorized later.
emp['info'] = (
    emp['name'] + ' ' + emp['lastName']
    + ' is one of the best employees'
)

In [6]:
emp.head()

Unnamed: 0,name,lastName,gender,age,education,salary,score,info
0,Ali,Rahmati,male,20.0,master,40000.0,80.0,Ali Rahmati is one of the best employees
1,Assadullah,Rahimi,male,20.0,bachler,31000.0,98.0,Assadullah Rahimi is one of the best employees
2,Abdullah,Rahnaward,male,25.0,bachler,45000.0,99.0,Abdullah Rahnaward is one of the best employees
3,Ahamd,Ahmadi,male,18.0,doctor,23000.0,88.0,Ahamd Ahmadi is one of the best employees
4,Zahra,Rasoli,female,25.0,master,14000.0,75.0,Zahra Rasoli is one of the best employees


# Stemming

## Stemming is the process of reducing a word to its root form.
## Example: "walking", "walked", "walks" -> "walk"

In [7]:
# Single Porter stemmer instance, reused by stemming() for every token.
port_stem = PorterStemmer()

In [10]:
def stemming(info):
    """Normalize one text record: letters only, lowercase, no stop words, stemmed.

    Args:
        info: Raw text of a single record (a ``str``).

    Returns:
        A string of Porter-stemmed tokens joined by single spaces, with
        English stop words removed.
    """
    # Build the stop-word set once per call. Calling stopwords.words('english')
    # inside the comprehension would evaluate it for every token, and testing
    # membership in a list is a linear scan; a set lookup is O(1).
    stop_words = set(stopwords.words('english'))
    # Replace every non-letter (digits, punctuation, symbols) with a space.
    letters_only = re.sub(r'[^a-zA-Z]', ' ', info)
    tokens = letters_only.lower().split()
    stems = [port_stem.stem(word) for word in tokens if word not in stop_words]
    return ' '.join(stems)

In [11]:
# Replace each sentence with its stemmed, stop-word-free form.
# NOTE: this overwrites the 'info' column in place, so re-running the cell
# passes already-stemmed text through stemming() again.
emp['info'] = emp['info'].apply(stemming)

In [12]:
# Features are the stemmed sentences and labels are the gender column, both
# taken as plain arrays. The DataFrame/Series versions of x and y were dropped
# because they were reassigned before ever being used.
x = emp['info'].values
y = emp['gender'].values

In [15]:
y.shape

(81,)

In [16]:
emp.head()

Unnamed: 0,name,lastName,gender,age,education,salary,score,info
0,Ali,Rahmati,male,20.0,master,40000.0,80.0,ali rahmati one best employe
1,Assadullah,Rahimi,male,20.0,bachler,31000.0,98.0,assadullah rahimi one best employe
2,Abdullah,Rahnaward,male,25.0,bachler,45000.0,99.0,abdullah rahnaward one best employe
3,Ahamd,Ahmadi,male,18.0,doctor,23000.0,88.0,ahamd ahmadi one best employe
4,Zahra,Rasoli,female,25.0,master,14000.0,75.0,zahra rasoli one best employe


### Convert the textual data to feature vectors

In [23]:
# TF-IDF vectorizer created with its default settings; it is fitted on the
# stemmed text in the next cell.
vectorizer = TfidfVectorizer()

In [24]:
# Learn the vocabulary and IDF weights and build the TF-IDF matrix in a single
# call; fit_transform is equivalent to fit() followed by transform() on the
# same data. After this line x holds the sparse document-term matrix rather
# than the raw text array.
x = vectorizer.fit_transform(x)

In [26]:
print(x)

  (0, 22)	0.19892356238949127
  (0, 18)	0.6174854241023203
  (0, 16)	0.19892356238949127
  (0, 15)	0.19892356238949127
  (0, 9)	0.19892356238949127
  (0, 6)	0.19892356238949127
  (0, 5)	0.19892356238949127
  (0, 3)	0.6174854241023203
  (1, 22)	0.19892356238949127
  (1, 17)	0.6174854241023203
  (1, 16)	0.19892356238949127
  (1, 15)	0.19892356238949127
  (1, 9)	0.19892356238949127
  (1, 6)	0.19892356238949127
  (1, 5)	0.19892356238949127
  (1, 4)	0.6174854241023203
  (2, 22)	0.19892356238949127
  (2, 19)	0.6174854241023203
  (2, 16)	0.19892356238949127
  (2, 15)	0.19892356238949127
  (2, 9)	0.19892356238949127
  (2, 6)	0.19892356238949127
  (2, 5)	0.19892356238949127
  (2, 0)	0.6174854241023203
  (3, 22)	0.19892356238949127
  :	:
  (77, 5)	0.19892356238949127
  (78, 22)	0.19892356238949127
  (78, 16)	0.19892356238949127
  (78, 15)	0.19892356238949127
  (78, 14)	0.6174854241023203
  (78, 9)	0.19892356238949127
  (78, 8)	0.6174854241023203
  (78, 6)	0.19892356238949127
  (78, 5)	0.19892356