<a href="https://colab.research.google.com/github/anshikatyagi23/Machine-Learning/blob/main/Feature_extraction_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

About the Dataset:

id: unique id for a news article
title: the title of a news article
author: author of the news article
text: the text of the article; could be incomplete
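label: marks whether the news article is real or fake (1 = fake news, 0 = real news)

Note: the cells below load a `train.csv` whose columns (PassengerId, Survived, Pclass, Name, Sex, Age, SibSp, Parch, Ticket, Fare, Cabin, Embarked) do not match the news schema described above; the same text-preprocessing steps are applied to the `Name` and `Cabin` columns instead.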

In [None]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer


In [None]:
import nltk
# fetch the NLTK 'stopwords' corpus so that stopwords.words(...) can be used in the cells below
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# printing the stopwords in English
# (this is the word list that the stemming step further down filters out of the text)
print(stopwords.words('english'))

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

In [None]:
# loading the dataset to a pandas DataFrame
# NOTE(review): '/content/train.csv' is an absolute path — presumably the Colab
# working directory; adjust it when running the notebook anywhere else.
news_dataset = pd.read_csv('/content/train.csv')

In [None]:
# (number of rows, number of columns) of the loaded DataFrame
news_dataset.shape

(891, 12)

In [None]:
# display the first 5 rows of the DataFrame to inspect its columns and sample values
news_dataset.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [None]:
# counting the number of missing values in each column of the dataset
news_dataset.isnull().sum()

Unnamed: 0,0
PassengerId,0
Survived,0
Pclass,0
Name,0
Sex,0
Age,177
SibSp,0
Parch,0
Ticket,0
Fare,0


In [None]:

# replacing the null values with empty string
# Only non-numeric columns are filled: writing '' into a numeric column (Age has
# missing values) would turn it into a column of mixed str/float objects.
non_numeric_columns = news_dataset.select_dtypes(exclude='number').columns
news_dataset = news_dataset.fillna({column: '' for column in non_numeric_columns})

In [None]:
# combine the Name and Cabin text, separated by a space, into one string column;
# the result is stored under 'Ticket', replacing the values that column held before
name_text = news_dataset['Name']
cabin_text = news_dataset['Cabin']
news_dataset['Ticket'] = name_text + ' ' + cabin_text

In [None]:
# inspect the merged Name + Cabin text now stored in the 'Ticket' column
print(news_dataset['Ticket'])

0                               Braund, Mr. Owen Harris 
1      Cumings, Mrs. John Bradley (Florence Briggs Th...
2                                Heikkinen, Miss. Laina 
3      Futrelle, Mrs. Jacques Heath (Lily May Peel) C123
4                              Allen, Mr. William Henry 
                             ...                        
886                               Montvila, Rev. Juozas 
887                     Graham, Miss. Margaret Edith B42
888            Johnston, Miss. Catherine Helen "Carrie" 
889                           Behr, Mr. Karl Howell C148
890                                 Dooley, Mr. Patrick 
Name: Ticket, Length: 891, dtype: object


In [None]:
# split the frame: the 'Fare' column alone becomes Y, every other column goes to X
Y = news_dataset['Fare']
X = news_dataset.drop(columns='Fare')

In [None]:
# show the feature frame first, then the 'Fare' series on the following line
print(X, Y, sep='\n')



     PassengerId  Survived  Pclass  \
0              1         0       3   
1              2         1       1   
2              3         1       3   
3              4         1       1   
4              5         0       3   
..           ...       ...     ...   
886          887         0       2   
887          888         1       1   
888          889         0       3   
889          890         1       1   
890          891         0       3   

                                                  Name     Sex   Age  SibSp  \
0                              Braund, Mr. Owen Harris    male  22.0      1   
1    Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                               Heikkinen, Miss. Laina  female  26.0      0   
3         Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                             Allen, Mr. William Henry    male  35.0      0   
..                                                 ...     ...   ... 

In [None]:

# single PorterStemmer instance, reused by stemming() for every word it processes
port_stem = PorterStemmer()

In [None]:
_ENGLISH_STOP_WORDS = None  # filled on the first stemming() call


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def stemming(content):
    """Normalise a piece of text into a string of stemmed, lower-case words.

    Every character that is not an ASCII letter is replaced by a space, the
    text is lower-cased and split on whitespace, English stop words are
    dropped, and each remaining word is reduced to its Porter stem.

    Parameters
    ----------
    content : str
        The raw text to clean.

    Returns
    -------
    str
        The stemmed words joined by single spaces ('' if nothing remains).
    """
    # Look the stop words up in a set that is built once, instead of asking
    # NLTK for the whole list again for every single word.
    stop_words = _english_stop_words()
    words = re.sub('[^a-zA-Z]', ' ', content).lower().split()
    return ' '.join(port_stem.stem(word) for word in words if word not in stop_words)


In [None]:
# run stemming() over every entry of the merged text column and store the result back
ticket_text = news_dataset['Ticket']
news_dataset['Ticket'] = ticket_text.apply(stemming)

In [None]:
# inspect the 'Ticket' column after stemming() has been applied to it
print(news_dataset['Ticket'])

0                             braund mr owen harri
1      cume mr john bradley florenc brigg thayer c
2                             heikkinen miss laina
3            futrel mr jacqu heath lili may peel c
4                           allen mr william henri
                          ...                     
886                             montvila rev juoza
887                   graham miss margaret edith b
888             johnston miss catherin helen carri
889                           behr mr karl howel c
890                              dooley mr patrick
Name: Ticket, Length: 891, dtype: object


In [None]:
# pull the stemmed text (X) and the fares (Y) out of the frame via .values
X, Y = (news_dataset[column].values for column in ('Ticket', 'Fare'))

In [None]:
# show the array of stemmed text strings
print(X)

['braund mr owen harri' 'cume mr john bradley florenc brigg thayer c'
 'heikkinen miss laina' 'futrel mr jacqu heath lili may peel c'
 'allen mr william henri' 'moran mr jame' 'mccarthi mr timothi j e'
 'palsson master gosta leonard'
 'johnson mr oscar w elisabeth vilhelmina berg'
 'nasser mr nichola adel achem' 'sandstrom miss marguerit rut g'
 'bonnel miss elizabeth c' 'saundercock mr william henri'
 'andersson mr ander johan' 'vestrom miss hulda amanda adolfina'
 'hewlett mr mari kingcom' 'rice master eugen' 'william mr charl eugen'
 'vander plank mr juliu emelia maria vandemoortel' 'masselmani mr fatima'
 'fynney mr joseph j' 'beesley mr lawrenc' 'mcgowan miss anna anni'
 'sloper mr william thompson' 'palsson miss torborg danira'
 'asplund mr carl oscar selma augusta emilia johansson'
 'emir mr far chehab' 'fortun mr charl alexand c c c'
 'dwyer miss ellen nelli' 'todoroff mr lalio' 'uruchurtu manuel e'
 'spencer mr william augustu mari eugeni b' 'glynn miss mari agatha'
 'wheadon 

In [None]:
# show the array of 'Fare' values
print(Y)

[  7.25    71.2833   7.925   53.1      8.05     8.4583  51.8625  21.075
  11.1333  30.0708  16.7     26.55     8.05    31.275    7.8542  16.
  29.125   13.      18.       7.225   26.      13.       8.0292  35.5
  21.075   31.3875   7.225  263.       7.8792   7.8958  27.7208 146.5208
   7.75    10.5     82.1708  52.       7.2292   8.05    18.      11.2417
   9.475   21.       7.8958  41.5792   7.8792   8.05    15.5      7.75
  21.6792  17.8     39.6875   7.8     76.7292  26.      61.9792  35.5
  10.5      7.2292  27.75    46.9      7.2292  80.      83.475   27.9
  27.7208  15.2458  10.5      8.1583   7.925    8.6625  10.5     46.9
  73.5     14.4542  56.4958   7.65     7.8958   8.05    29.      12.475
   9.       9.5      7.7875  47.1     10.5     15.85    34.375    8.05
 263.       8.05     8.05     7.8542  61.175   20.575    7.25     8.05
  34.6542  63.3583  23.      26.       7.8958   7.8958  77.2875   8.6542
   7.925    7.8958   7.65     7.775    7.8958  24.15    52.      14.4542
  

In [None]:
# length of the 'Fare' array (one entry per row of the dataset)
Y.shape

(891,)

In [None]:
# convert the textual data to Feature Vectors
# (TfidfVectorizer created with its default settings; it is fitted on X in the next cell)
vectorizer = TfidfVectorizer()

In [None]:
# Fit the vectorizer on the text in X and replace X with its TF-IDF matrix in a
# single pass (equivalent to fit(X) followed by transform(X)).
# NOTE: X is overwritten here, so re-run the cell that builds the text array
# before running this cell a second time.
X = vectorizer.fit_transform(X)

In [None]:


# print the sparse TF-IDF matrix: (row, column) coordinates with their stored weights
print(X)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 3561 stored elements and shape (891, 1473)>
  Coords	Values
  (0, 173)	0.6122664812320111
  (0, 564)	0.48548528626888116
  (0, 986)	0.12067952906754793
  (0, 1068)	0.6122664812320111
  (1, 170)	0.4343749521361649
  (1, 179)	0.46068230566973467
  (1, 290)	0.46068230566973467
  (1, 445)	0.3894022342249528
  (1, 690)	0.25867146346989556
  (1, 986)	0.08561658406818297
  (1, 1336)	0.4012316063594593
  (2, 583)	0.6847964246548636
  (2, 767)	0.6847964246548636
  (2, 959)	0.2492141921316511
  (3, 477)	0.4055685568087742
  (3, 579)	0.4055685568087742
  (3, 658)	0.3881410213219276
  (3, 813)	0.4055685568087742
  (3, 917)	0.4055685568087742
  (3, 986)	0.07993875859707769
  (3, 1092)	0.43013128850773164
  (4, 45)	0.7624999049002402
  (4, 591)	0.48269382745394174
  (4, 986)	0.1502909799214403
  (4, 1445)	0.4037489137065785
  :	:
  (884, 1319)	0.7127827059432275
  (885, 878)	0.4619947096937033
  (885, 986)	0.1165984344700052
  (885, 1038)