In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from nltk.corpus import stopwords
from textblob import Word, TextBlob
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from nltk.sentiment import SentimentIntensityAnalyzer
from warnings import filterwarnings

In [2]:
filterwarnings('ignore')
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', lambda x: '%.2f' % x)
pd.set_option('display.width', 200)

In [35]:
file_path = "../datasets/wiki-221126-161428/wiki_data.csv"
df = pd.read_csv(file_path, index_col=0)

In [36]:
df.head()

Unnamed: 0,text
1,Anovo\n\nAnovo (formerly A Novo) is a computer...
2,Battery indicator\n\nA battery indicator (also...
3,"Bob Pease\n\nRobert Allen Pease (August 22, 19..."
4,CAVNET\n\nCAVNET was a secure military forum w...
5,CLidar\n\nThe CLidar is a scientific instrumen...


In [37]:
df = df[:2000]

In [38]:
def clean_text(text):
    """
    Parameters:
        text (str): The information text given in str format
        
    Returns:
        str: A preprocessed text
    """
    text = text.lower()
    text = text.replace('[^\w\s]', '', regex=True)
    text = text.replace('\d', '', regex=True)
    text = text.replace('\n', '', regex=True)
    return text

In [39]:
df["text"] = df["text"].str.replace('[^\w\s]', '', regex=True)

In [40]:
df["text"] = df["text"].str.replace('\d', '', regex=True)

In [41]:
df["text"] = df["text"].str.replace('\n', '', regex=True)

In [42]:
df["text"].head()

1    AnovoAnovo formerly A Novo is a computer servi...
2    Battery indicatorA battery indicator also know...
3    Bob PeaseRobert Allen Pease August  Â â June  ...
4    CAVNETCAVNET was a secure military forum which...
5    CLidarThe CLidar is a scientific instrument us...
Name: text, dtype: object

In [43]:
sw = stopwords.words("english")
sw

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [44]:
df["text"] = df["text"].apply(lambda x: " ".join(i for i in str(x).split() if i not in sw))

In [45]:
df["text"].head()

1    AnovoAnovo formerly A Novo computer services c...
2    Battery indicatorA battery indicator also know...
3    Bob PeaseRobert Allen Pease August Â â June an...
4    CAVNETCAVNET secure military forum became oper...
5    CLidarThe CLidar scientific instrument used me...
Name: text, dtype: object

Metinde az geçen (1000'den az, 2000'den az gibi) kelimeleri bulunuz. Ve bu kelimeleri metin içerisinden çıkartınız.

In [46]:
pd.Series(" ".join(df["text"]).split()).value_counts()[-1000:]

tractorsBulldozers    1
militarized           1
Intifada              1
Tournadozer           1
suppliesA             1
                     ..
portfire              1
bitumen               1
wildfireBy            1
StarSpangled          1
fluidssolids          1
Name: count, Length: 1000, dtype: int64

In [47]:
sil = pd.Series(' '.join(df["text"]).split()).value_counts()[-1000:]
df["text"] = df["text"].apply(lambda x: ' '.join(i for i in str(x).split() if i not in sil))

In [48]:
df["text"].head()

1    AnovoAnovo formerly A Novo computer services c...
2    Battery indicatorA battery indicator also know...
3    Bob PeaseRobert Allen Pease August Â â June an...
4    CAVNETCAVNET secure military forum became oper...
5    CLidarThe CLidar scientific instrument used me...
Name: text, dtype: object

Adım6: Metinleri tokenize edip sonuçları gözlemleyiniz.

In [49]:
df["text"].apply(lambda x: TextBlob(x).words)

1       [AnovoAnovo, formerly, A, Novo, computer, serv...
2       [Battery, indicatorA, battery, indicator, also...
3       [Bob, PeaseRobert, Allen, Pease, August, Â, â,...
4       [CAVNETCAVNET, secure, military, forum, became...
5       [CLidarThe, CLidar, scientific, instrument, us...
                              ...                        
1996    [Edinburgh, Calotype, ClubThe, Edinburgh, Calo...
1997    [EndrinEndrin, organochloride, chemical, formu...
1998    [Ethylene, glycol, dinitrateEthylene, glycol, ...
1999    [Forges, de, SyamThe, Forges, de, Syam, Syam, ...
2000    [FrankKamenetskii, theoryIn, combustion, Frank...
Name: text, Length: 2000, dtype: object

Adım7: Lemmatization işlemi yapınız.

In [61]:
df["text"] = df["text"].apply(lambda x: ' '.join([Word() for word in ]))

'ran'