In [22]:
import string
import pandas as pd
import numpy as np

from collections import Counter
from nltk.tokenize import TweetTokenizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn import linear_model
from sklearn.model_selection import train_test_split

np.set_printoptions(precision=5)
%matplotlib inline
pd.options.display.max_columns=1000

Source: http://www.dt.fee.unicamp.br/~tiago/smsspamcollection/

In [23]:
df = pd.read_csv('SMSSpamCollection.txt', sep='\t', header=None)

In [24]:
df.columns = ['spam', 'text']

In [25]:
df.head()

Unnamed: 0,spam,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [26]:
# Get rid of the punctuation
translator = str.maketrans('', '', string.punctuation)
df.text = df.text.apply(lambda x: x.translate(translator))
df.head()

Unnamed: 0,spam,text
0,ham,Go until jurong point crazy Available only in ...
1,ham,Ok lar Joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor U c already then say
4,ham,Nah I dont think he goes to usf he lives aroun...


In [27]:
df['spam'] = df['spam'] == 'spam' # makes True/False nstead of "spam" and "ham"
df['spam'] = df['spam'].astype(int)  # number values instead of boolean value

In [28]:
df.text[0]

'Go until jurong point crazy Available only in bugis n great world la e buffet Cine there got amore wat'

In [29]:
df.head()

Unnamed: 0,spam,text
0,0,Go until jurong point crazy Available only in ...
1,0,Ok lar Joking wif u oni
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor U c already then say
4,0,Nah I dont think he goes to usf he lives aroun...


In [30]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
spam    5572 non-null int32
text    5572 non-null object
dtypes: int32(1), object(1)
memory usage: 65.4+ KB


In [31]:
# Use TweetTokenizer 
tknzr = TweetTokenizer()
df['text'] = df.text.apply(tknzr.tokenize)
df['text'].head()

0    [Go, until, jurong, point, crazy, Available, o...
1                       [Ok, lar, Joking, wif, u, oni]
2    [Free, entry, in, 2, a, wkly, comp, to, win, F...
3    [U, dun, say, so, early, hor, U, c, already, t...
4    [Nah, I, dont, think, he, goes, to, usf, he, l...
Name: text, dtype: object

In [32]:
# Adding a new feature 'num_words' - how many words are in a message
df['num_words'] = df.text.apply(len)
df['num_words'].head()

0    20
1     6
2    30
3    11
4    13
Name: num_words, dtype: int64

In [33]:
df.text[0]

['Go',
 'until',
 'jurong',
 'point',
 'crazy',
 'Available',
 'only',
 'in',
 'bugis',
 'n',
 'great',
 'world',
 'la',
 'e',
 'buffet',
 'Cine',
 'there',
 'got',
 'amore',
 'wat']

In [34]:
len(df.text)

5572

In [35]:
len(df.text[5571])

6

In [55]:
df.text[0]

['Go',
 'until',
 'jurong',
 'point',
 'crazy',
 'Available',
 'only',
 'in',
 'bugis',
 'n',
 'great',
 'world',
 'la',
 'e',
 'buffet',
 'Cine',
 'there',
 'got',
 'amore',
 'wat']

In [None]:
# Here for each element of the list we need to get the number of characters and then sum it up
# Is there to do it more effectively then brute force?

#HELL YEAH! LOOK below. 
#L = []
#for i in range(len(df.text)):
#    sum = 0
#    for j in range(len(df.text[i])):
#        sum += len(df.text[i][j])
#    L.append(sum)  

Here I want to have 'raw' messages after getting rid of punctuation and applying TweetTokenizer. I want it because I'll later use it as to transform_fit tfidf.

In [66]:
documents = []
for i in range(len(df.text)):
    documents.append(join_string(df.text[i]))
        
documents[:5]

['Go until jurong point crazy Available only in bugis n great world la e buffet Cine there got amore wat',
 'Ok lar Joking wif u oni',
 'Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005 Text FA to 87121 to receive entry questionstd txt rateTCs apply 08452810075 over 18s',
 'U dun say so early hor U c already then say',
 'Nah I dont think he goes to usf he lives around here though']

In [69]:
L = []
for i in documents:
    L.append(len(i))
L[:5]

[102, 23, 151, 43, 59]

In [70]:
# adding a new feature 'length' - how many characters a message have (spaces are not included)
df['length'] = L
df['length'].head()

0    102
1     23
2    151
3     43
4     59
Name: length, dtype: int64

In [71]:
df.columns

Index(['spam', 'text', 'num_words', 'length'], dtype='object')

In [72]:
df.text.head()

0    [Go, until, jurong, point, crazy, Available, o...
1                       [Ok, lar, Joking, wif, u, oni]
2    [Free, entry, in, 2, a, wkly, comp, to, win, F...
3    [U, dun, say, so, early, hor, U, c, already, t...
4    [Nah, I, dont, think, he, goes, to, usf, he, l...
Name: text, dtype: object

In [73]:
# Creating a new DataFrame df_words from bag of words
df_words = df.text.apply(Counter).apply(dict).values #list of dictoniaries to DF
df_words = pd.DataFrame(list(df_words))

In [74]:
df_words.columns

Index(['0', '0089my', '00Please', '01', '0121 2025050', '01223585236',
       '01223585334', '01256987', '02', '020603',
       ...
       'ü', 'üll', '–', '—', '‘', '’', '“', '…', '┾', '〨ud'],
      dtype='object', length=11662)

In [75]:
df_words['length'] = df['length']
df_words['num_words'] = df['num_words']
df_words['spam'] = df.spam

In [76]:
df_words.columns

Index(['0', '0089my', '00Please', '01', '0121 2025050', '01223585236',
       '01223585334', '01256987', '02', '020603',
       ...
       '–', '—', '‘', '’', '“', '…', '┾', '〨ud', 'num_words', 'spam'],
      dtype='object', length=11664)

In [77]:
df_words = df_words.fillna(0).astype(int)
df_words.head()

Unnamed: 0,0,0089my,00Please,01,0121 2025050,01223585236,01223585334,01256987,02,020603,0207 153 9153,0207 153 9996,02070836089,02072069400,02073162414,02085076972,020903,03530150,040902,050703,06,060505,061104,07008009200,07046744435,07090201529,07090298926,07099833605,071104,07123456789,07732584351,07734396839,07742676969,07753741225,0776xxxxxxx,07781482378,07786200117,077xxx,078,07801543489,07808,07808247860,07808726822,07815296484,07821230901,0784987,07880867867,0789xxxxxxx,07946746291,0796XXXXXX,07973788240,07XXXXXXXXX,07xxxxxxxxx,08,0800,0800 0721072,0800 169 6031,0800 195 6669,0800 1956669,0800 5050,0800 542 0578,0800 542 0825,08000407165,08000776320,08000839402,08000930705,08000938767,08001950382,08002888812,08002986030,08002986906,08002988890,08006344447,0808 145 4742,08081263000,08081560665,0819,0844,08448350055,08448714184,0845 021 3680,0845 2814032,08450542832,08452810071,08452810073,08452810075,0870,08700469649,08700621170,08701213186,08701237397,08701417012,08701624,08701752560,08701872873,08702411827,08702490080,08702840625,08704439680,08706091795,08707379102,08707500020,08707509020,08707533310,08707808226,08708034412,08708800282,08709222922,08709501522,0870k,08710471114,08712101358,08712103738,08712120250,08712300220,08712317606,08712400200,08712400602,08712400603,08712402050,08712402578,08712402779,08712402902,08712402972,08712404000,08712405020,08712405022,08712460324,08712466669,08712778107,08712778108,08712778109,08714342399,08714712377,08714712379,08714712388,08714712394,08714712412,08714714011,08714719523,08714740323,08714742804,08715203028,08715203649,08715203652,08715203656,08715203677,08715203685,08715203694,08715205273,08715500022,08715705022,08717111821,08717168528,08717205546,08717507382,08717507711,08717509990,08717890890,08717895698,08717898035,08718711108,08718720201,08718723815,08718725756,08718726270,08718726970,08718726971,08718726978,08718727200,08718727868,08718727870,08718728876,08718729755,08718729758,08718730555,08718730666,08718738001,08718738002,08718738034,08719180219,08719180248,08719181259,08719181503,08719181513,08719839835,08719899217,08719899229,08719899230,09,09041940223,09050000301,09050000327,09050000332,09050000460,09050000555,09050000878,09050000928,09050001295,09050001808,09050002311,09050003091,09050005321,09050090044,09050280520,09053750005,09056242159,09057039994,09058091854,09058091870,09058094454,09058094455,09058094507,09058094565,09058094583,09058094594,09058094597,09058094599,09058095107,09058095201,09058097189,09058097218,09058098002,09058099801,09061104276,09061104283,09061209465,09061213237,09061221061,09061221066,09061701444,09061701461,09061701851,09061701939,09061702893,09061743386,09061743806,09061743810,09061743811,09061744553,09061749602,09061790121,09061790125,09061790126,09063440451,09063442151,09063458130,09063463,09064011000,09064012103,09064012160,09064015307,09064017295,09064017305,09064018838,09064019014,09064019788,09065069120,09065069154,09065171142,09065174042,09065394514,09065394973,09065989180,09065989182,09066350750,09066358152,09066358361,09066361921,09066362206,09066362220,09066362231,09066364311,09066364349,09066364589,09066368327,09066368470,09066368753,09066380611,09066382422,09066612661,09066649731,09066660100,09071512432,09071512433,09071517866,09077818151,09090204448,09090900040,09094100151,09094646631,09094646899,09095350301,09096102316,09099725823,09099726395,09099726429,09099726481,09099726553,09111030116,09111032124,09701213186,0ANETWORKS,0Hi,0pwk,0quit,1,10,100,1000,10000,100000,1000CALL,1000s,100603,100pSMS,100percentrealcom,100s,100txtmth,1010,1013,101mega,1030,10803,10K,10am,10am7pm,10am9pm,10k,10mins,10p,10pmin,10ppm,10th,11,1120,113,1131,11414,11414TCRW1,1146,1148,116,1172,118pmsg,11mths,12,120,12000pes,1205,121,1225,123,1230,125,1250,125gift,128,1282EssexCM61XN,12Mths,12hours,12hrs,12mths,12n146tf15,12n146tf150p,12price,13,130,131004,1323,1327,13404,139,140,1405 1680,140ppm,1450,146tf150p,14thMarch,150,150 0087040,1500,150Mtmsgrcvd18,150P,150P16,150PPM,150ea,150gbpmtmsg18,150moreFrmMob,150msg,150p,150pMSGRCVD,150pMTmsg,150pMsg,150pMsgrcvdHGSuite3422LandsRowW1J6HL,150pMt,150pMtmsgrcvd18,150pSMS,150pday,150perWKsub,150perweeksub,150pm,150pmeg,150pmin,150pmmorefrommobile2BremovedMobyPOBox734LS27YF,150pmsg,150pmsg2,150ppermessSubscription,150ppm,150ppmPOBox10183BhamB64XE,150ppmsg,150ppmx3age16,150prcvd,150ptext,150ptone,150pw,150pwk,150rcvd,150week,150wk,150x3normal,151,1510,1526,15541,15H,15pmin,16,16150ppermessSubscription,161win150ppmx3,165,16only,16yrs,177,177HP51FL,18,180,181104,1843,186,1896WC1N3XX,18coukwavewaveaspo44345,18only,18ptxt,18s,18yrs,1AppleDayNo,1Cup,1DA,1ER,1Hanuman,1His,1IM,1J6HL,1JHL,1LemonDayNo,1McFlyAll,1N3XX,1Tulsi,1U,1Unbreakable,1Winaweek,1Winawk,1WinawkAge16,1YF,1b6a5ecef91ff937819firsttrue180430JUL05,1childish,1com,1couk,1hr,1j6HL,1million,1minMobsmore,1minMobsmoreLKPOBOX177HP51FL,1minmoremobsEMSPOBox45PO139WA,1month,1more,1pm,1s,1st,1st4Terms,1stchoicecouk,1stone,1thingi,1u,1win150ppmx3age16,1win150ppmx3age16subscription,1x150pwk,2,20,200,2000,20000,2003,2004,2005,2006,2007,200p,202,...,visa,visionsmscom,visit,visiting,visitneed,visitors,vital,vitamin,vivek,viveki,vl,vldo,vodafone,vodka,voice,voicemail,volcanoes,vomit,vomitin,vomiting,vote,voted,voucher,vouchers,vouchersText,vpist,vry,vs,vth,vtired,w,w1t1jy,w8in,wa,wad,wadebridgeI,wahala,wahay,waheed,waheeda,waht,wait,waited,waitin,waiting,wake,waking,waliking,walk,walkabout,walked,walkin,walking,walks,wall,wallet,wallpaper,wallpaperall,walls,walmart,walsall,wamma,wan,wana,wanna,want,wanted,wanting,wants,wap,warm,warming,warned,warner,warning,warranty,warwick,was,washob,wasn,wasnt,waste,wasted,wasting,wat,watch,watched,watches,watchin,watching,watchng,water,watever,watevr,watll,wats,watts,waves,way,waythis,wc,wc1n3xx,we,weak,weakness,weaknesses,weapon,wear,wearing,weaseling,weasels,weather,weathers,web,webadres,website,websitenow,wed,weddin,wedding,wednesday,weds,wee,weed,weeddeficient,week,weekdays,weekend,weekends,weekly,weeks,weekstop,weigh,weighed,weight,weightHaha,weird,weirdest,weirdo,weirdy,welcome,welcomes,well,welli,welltake,wellyou,welp,wen,wenever,went,wer,were,wereare,werent,wesley,wesleys,west,western,westonzoyland,westshore,wet,wetherspoons,weve,what,whatever,whats,wheat,wheel,wheellock,when,whenever,whens,where,whereare,wherebtw,wheres,wherever,whether,which,while,whileamp,whillTake,whispers,white,whn,who,whole,whom,whos,whose,whr,why,wi,wicked,wicket,wid,widelivecomindex,wif,wife,wifedont,wifes,wifi,wihtuot,wikipediacom,wil,wildest,wildlife,will,willing,willpower,win,wind,window,windows,winds,windy,wine,wined,wining,winner,winning,wins,winterstone,wipro,wiproyou,wisdom,wise,wish,wisheds,wishes,wishin,wishing,wishlist,wiskey,wit,with,withdraw,wither,within,without,witin,witot,witout,wiv,wizzle,wk,wkTXT,wkend,wkg,wkly,wknd,wks,wlcome,wld,wmlid,wnt,wo,woah,wocay,woke,woken,woman,womdarfull,women,won,wondar,wondarfull,wonder,wonderful,wondering,wonders,wont,woot,woould,woozles,worc,word,wordCOLLECT,wordSTART,wordnot,words,work,workAnd,workLove,workage,workin,working,workout,works,world,worldgnun,worldmay,worlds,worldvery,worms,worried,worries,worry,worrying,worryuse,worse,worst,worth,worthless,wot,woul,would,woulda,wouldnt,wounds,wow,wquestion,wrecked,wrench,wrenching,write,writhing,wrk,wrking,wrks,wrnog,wrong,wrongly,wrote,ws,wt,wtc,wtf,wth,wthout,wud,wudnt,wuld,wuldnt,wun,www,wwwApplausestorecom,wwwB,wwwIdewcom,wwwLdewcom,wwwLdewcomsubs,wwwSMSacubootydelious,wwwSMSacugoldviking,wwwSMSacuhmmross,wwwSMSacunat,wwwSMSacunatalie,wwwareyouuniquecouk,wwwbridalpetticoatdreamscouk,wwwcashbincouk,wwwclubmobycom,wwwclubzedcouk,wwwcnupdatescomnewsletter,wwwcomuknet,wwwdbuknet,wwwflirtpartyus,wwwfullonsmscom,wwwgambtv,wwwgetzedcouk,wwwldewcom,wwwmovietriviatv,wwwmusictrivianet,wwworangecoukow,wwwphb,wwwregalportfoliocouk,wwwringtonekingcouk,wwwringtonescouk,wwwrtfsphostingcom,wwwsantacallingcom,wwwshortbreaksorguk,wwwsmsconet,wwwtcbiz,wwwtelediscountcouk,wwwtextcompcom,wwwtextpodnet,wwwtklscom,wwwtxt,wwwtxttowincouk,wwwwin,wylie,x,xafter,xam,xavier,xin,xins,xmas,xnet,xt,xuhui,xx,xxSP,xxx,xxxmobilemovieclubcomnQJKGIGHJJGCBL,xxxx,xxxxxxx,xxxxxxxx,xxxxxxxxxxxxxX,xy,y,ya,yah,yahoo,yalrigu,yam,yan,yards,yay,yck,yday,yeah,year,years,yelling,yellow,yeovil,yep,yer,yes,yest,yesterday,yet,yettys,yetunde,yi,yifeng,yijue,yijuehotmailcom,ym,yo,yoHere,yoga,yogasana,yor,yorge,you,youPhone,youTo,youany,youcarlos,youd,youdearwith,youdoing,youhow,youkwhere,yould,youll,youmoney,youmy,young,younger,your,youre,yourinclusive,yours,yourself,youso,youthats,youuuuu,youve,youwanna,youwhen,yoville,yowifes,yr,yrs,ystrdayice,yummy,yun,yuo,yuou,zac,zealand,zed,zeros,zhong,zoom,zyada,,,,,,¡,£,»,Ü,Üll,é,ü,üll,–,—,‘,’,“,…,┾,〨ud,num_words,spam
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,20,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,6,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,30,1
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,11,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,13,0


In [78]:
df_words.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Columns: 11664 entries, 0 to spam
dtypes: int32(11664)
memory usage: 247.9 MB


In [47]:
df.describe()

Unnamed: 0,spam,num_words,length
count,5572.0,5572.0,5572.0
mean,0.134063,15.543431,61.609476
std,0.340751,11.287774,45.970978
min,0.0,0.0,0.0
25%,0.0,7.0,27.0
50%,0.0,12.0,46.0
75%,0.0,23.0,93.0
max,1.0,171.0,718.0


In [49]:
# Finally, save our preprocessed data
#df_words.to_csv('SMS_preprocessed.csv')

In [79]:
tfidf = TfidfTransformer()
vectorizer = CountVectorizer(ngram_range=(1, 2))
#  TfidfTransformer takes the raw term frequencies from CountVectorizer 
# as input and transforms them into tf-idfs

In [80]:
# Linear Regression model
model = linear_model.LinearRegression()

In [81]:
X = tfidf.fit_transform(vectorizer.fit_transform(documents)).toarray()
y = np.array(df_words.spam)

In [82]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [83]:
model.fit(X_train, y_train)
print("Model core for training set:", model.score(X_train, y_train))
print("Model core for testing set:", model.score(X_test, y_test))

Model core for training set: 1.0
Model core for testing set: 0.8264001954024383
