## Wikipedia Crawl example

Author: J. Hickman

- This code crawls through wikipedia to get a bunch of text data
- The code lets the user specify search category topics.
  - The more different the topics are, the easier the classification will be.
  - For example, i used (pizza, metallurgy, basketball)
- It then searches wikipedia for articles related to these topics
- Loops over the wikipedia pages and gets the text from the wikipedia pages
- Breaks the text into chunks (based on a user input specifying the number of sentences per chunk)
- Each chunk is cleaned and tagged with a "label" (classification) and a numeric "sentiment score" (regression)
- These cleaned chunks form a corpus of strings with associated tags

```
python -m pip install wikipedia_sections
```

### Import

In [1]:
# conda install -c conda-forge wikipedia
# conda install -c conda-forge wordcloud
# pip install wikipedia_sections

import wikipedia
import nltk
import string 
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.sentiment import SentimentIntensityAnalyzer
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np




In [2]:
# RUN THE FOLLOWING IF YOU HAVEN'T DOWNLOADED THESE BEFORE
nltk.download('vader_lexicon')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/katherinemead/nltk_data...
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/katherinemead/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/katherinemead/nltk_data...
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/katherinemead/nltk_data...


True

In [5]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/katherinemead/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

### Set user parameters 

In [6]:
# PARAMETERS 
label_list=['school district','school takeover','education reform']
max_num_pages=25
sentence_per_chunk=5
min_sentence_length=20

# GET STOPWORDS
from nltk.corpus import stopwords
stop_words=nltk.corpus.stopwords.words('english')

# INITALIZE STEMMER+LEMITZIZER+SIA
sia = SentimentIntensityAnalyzer()
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

### Define text cleaning function

In [7]:
def clean_string(text):
	# #FILTER OUT UNWANTED CHAR
	new_text=""
	keep=string.printable
	keep=" abcdefghijklmnopqrstuvwxyz0123456789"
	for character in text:
		if character.lower() in keep:
			new_text+=character.lower()
		else: 
			new_text+=" "
	text=new_text
	# print(text)

	# #FILTER OUT UNWANTED WORDS
	new_text=""
	for word in nltk.tokenize.word_tokenize(text):
		if word not in nltk.corpus.stopwords.words('english'):
			#lemmatize 
			tmp=lemmatizer.lemmatize(word)
			# tmp=stemmer.stem(tmp)

			# update word if there is a change
			# if(tmp!=word): print(tmp,word)
			
			word=tmp
			if len(word)>1:
				if word in [".",",","!","?",":",";"]:
					#remove the last space
					new_text=new_text[0:-1]+word+" "
				else: #add a space
					new_text+=word.lower()+" "
	text=new_text.strip()
	return text

# clean_string('the word "pizza" first appeared in a Latin text from the town of Gaeta, then still part of the Byzantine Empire, in 997 AD; the text states that a tenant of certain property is to give the bishop of Gaeta duodecim pizze ("twelve pizzas") every Christmas Day, and another twelve every Easter Sunday.Suggested etymologies include:')


### Preform a wikipedia crawl


In [8]:
#INITIALIZE 
corpus=[]  # list of strings (input variables X)
targets=[] # list of targets (labels or response variables Y)

#--------------------------
# LOOP OVER TOPICS 
#--------------------------
for label in label_list:

	#SEARCH FOR RELEVANT PAGES 
	titles=wikipedia.search(label,results=max_num_pages)
	print("Pages for label =",label,":",titles)

	#LOOP OVER WIKI-PAGES
	for title in titles:
		try:
			print("	",title)
			wiki_page = wikipedia.page(title, auto_suggest=True)

			# LOOP OVER SECTIONS IN ARTICLE AND GET PAGE TEXT
			for section in wiki_page.sections:
				text=wiki_page.section(section); #print(text)

				#BREAK IN TO SENTANCES 
				sentences=nltk.tokenize.sent_tokenize(text)
				counter=0
				text_chunk=''

				#LOOP OVER SENTENCES 
				for sentence in sentences:
					if len(sentence)>min_sentence_length:
						if(counter%sentence_per_chunk==0 and counter!=0):
							# PROCESS COMPLETED CHUNK 
							
							# CLEAN STRING
							text_chunk=clean_string(text_chunk)

							# REMOVE LABEL IF IN STRING (MAKES IT TOO EASY)
							text_chunk=text_chunk.replace(label,"")
							
							# REMOVE ANY DOUBLE SPACES
							text_chunk=' '.join(text_chunk.split()).strip()

							#UPDATE CORPUS 
							corpus.append(text_chunk)

							#UPDATE TARGETS
							score=sia.polarity_scores(text_chunk)
							target=[label,score['compound']]
							targets.append(target)

							#print("TEXT\n",text_chunk,target)

							# RESET CHUNK FOR NEXT ITERATION 
							text_chunk=sentence
						else:
							text_chunk+=sentence
						#print("--------\n", sentence)
						counter+=1

		except:
			print("WARNING: SOMETHING WENT WRONG:", title);  


Pages for label = school district : ['School district', 'List of school districts in Arizona', 'Robb Elementary School shooting', 'School uniform', 'Lockport, Illinois', 'Vacaville, California', 'List of school districts in California', 'School District of Philadelphia', 'Tinker v. Des Moines Independent Community School District', 'Bethel School District v. Fraser', 'List of Long Island public school districts and schools', 'School discipline', 'Homewood-Flossmoor High School', 'List of the largest school districts in the United States by enrollment', 'School bus', 'Magnet school', 'Kennedy v. Bremerton School District', 'Hollywood High School', 'District', 'Hazelwood School District v. Kuhlmeier', 'Los Angeles Unified School District', 'Unified school district', 'Norwalk–La Mirada Unified School District', 'Frisco Independent School District', 'Chicago Public Schools']
	 School district
	 List of school districts in Arizona
	 Robb Elementary School shooting
	 School uniform
	 Lockpor

### Save results

In [9]:
#SANITY CHECKS AND PRINT TO FILE 
print("number of text chunks = ",len(corpus))
print("number of targets = ",len(targets))

tmp=[]
for i in range(0,len(corpus)):
    tmp.append([corpus[i],targets[i][0],targets[i][1]])
df=pd.DataFrame(tmp)
df=df.rename(columns={0: "text", 1: "label", 2: "sentiment"})
print(df)
df.to_csv('wiki-crawl-results.csv',index=False)

number of text chunks =  970
number of targets =  970
                                                  text             label  \
0    12 public school function unit local usually o...   school district   
1    controlling law varies united state operate in...   school district   
2    power tax spend generally limited independent ...   school district   
3    school board may also exercise quasi judicial ...   school district   
4    outside united state autonomous district equiv...   school district   
..                                                 ...               ...   
965  hastings born boston massachusetts father wilm...  education reform   
966  hastings first job adaptive technology created...  education reform   
967  company growth proved challenging hastings lac...  education reform   
968  1997 hastings former pure software employee ma...  education reform   
969  selling pure software hastings found without g...  education reform   

     sentiment  
0       0.6369  

### Extra Code

In [10]:
# #RELOAD FILE AND PRETEND THAT IS OUR STARTING POINT 
df=pd.read_csv('wiki-crawl-results.csv')  
# #print(df)

# #CONVERT FROM STRING LABELS TO INTEGERS 
labels=[]; #y1=[]; y2=[]
y1=[]
for label in df["label"]:
    if label not in labels:
        labels.append(label)
        print("index =",len(labels)-1,": label =",label)
    for i in range(0,len(labels)):
        if(label==labels[i]):
            y1.append(i)
y1=np.array(y1)

# # CONVERT DF TO LIST OF STRINGS 
corpus=df["text"].to_list()
y2=df["sentiment"].to_numpy()

print("number of text chunks = ",len(corpus))
print(len(y1))
print(corpus[0:3])

# # INITIALIZE COUNT VECTORIZER
vectorizer=CountVectorizer()   

# # RUN COUNT VECTORIZER ON OUR COURPUS 
Xs  =  vectorizer.fit_transform(corpus)   
X=np.array(Xs.todense())

# #CONVERT TO ONE-HOT VECTORS
maxs=np.max(X,axis=0)
X=np.ceil(X/maxs)

# # DOUBLE CHECK 
print(X.shape,y1.shape,y2.shape)

index = 0 : label = school district
index = 1 : label = school takeover
index = 2 : label = education reform
number of text chunks =  970
970
['12 public school function unit local usually operate several school largest urban suburban district operate hundred school practice varies significantly state case within state american operate independent local governmental unit grant authority within geographic limit created state law executive legislative power locally controlled policy operation independent case held board education depending state law member local board education often referred informally school board may elected appointed political office holder serve ex officio combination independent legally separate body corporate political', 'controlling law varies united state operate independent local governmental unit exclusive authority 12 public educational operation policy extent control set state level law litigation common law firm specialize school law handle litigation paid 