# Experiments

In [1]:
import vectorizers
import kernels
from nltk.tree import Tree
import numpy as np
from sklearn import svm
from sklearn import cross_validation
from sklearn import feature_extraction

## loading data

In [2]:
#loading files
with open('../data/3gables.csv','r') as f:
	s1 = f.read()

with open('../data/100west.csv','r') as f:
	s2 = f.read()


int2cl = {0:'descriptive', 1:'argumentative', 2:'narrative',3:'explicative'}

t1 = Tree.fromstring(s1)
t2 = Tree.fromstring(s2)
t_list = [t1,t2,t2,t1,t1,t2]

#computing dicts
D = np.array([vectorizers.build_norm_vect(t) for t in t_list])
y = np.array([0,1,1,0,0,1])

#transforming dict to vect
v = feature_extraction.DictVectorizer(sparse=False)
X = v.fit_transform(D)
Y = v.inverse_transform(X)

## loading classifiers

In [3]:
clf2 = svm.LinearSVC()
clf2.fit(X,y)
print(clf2.predict(X))
print [int2cl[x] for x in clf2.predict(X)]

scores = cross_validation.cross_val_score(clf2,X,y,cv=2)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))


#clf3 = svm.SVC(kernel='rbf')
#clf3.fit(X,y)
#clf3.predict(X)

[0 1 1 0 0 1]
['descriptive', 'argumentative', 'argumentative', 'descriptive', 'descriptive', 'argumentative']
Accuracy: 1.00 (+/- 0.00)


## using precomputed kernels

In [9]:
def compute_kernel(X,Y,kernel=kernels.rbf_kernel):
	"""computes a gram matrix K with matrices X and Y 
	such as K[i,j] = kernel(X[i],Y[j]).
	"""
	K = np.zeros((len(X),len(Y)))
	for i,x in enumerate(X):
		for j,y in enumerate(Y):
			K[i, j] = kernel(x,y)
	return K

#precomputing kernels for train
K = compute_kernel(D,D)
print(K)

clf1 = svm.SVC(kernel="precomputed")
clf1.fit(K,y)

print(clf1.predict(K))
scores = cross_validation.cross_val_score(clf2,X,y,cv=2)


[[ 1.          0.97938667  0.97938667  1.          1.          0.97938667]
 [ 0.97938667  1.          1.          0.97938667  0.97938667  1.        ]
 [ 0.97938667  1.          1.          0.97938667  0.97938667  1.        ]
 [ 1.          0.97938667  0.97938667  1.          1.          0.97938667]
 [ 1.          0.97938667  0.97938667  1.          1.          0.97938667]
 [ 0.97938667  1.          1.          0.97938667  0.97938667  1.        ]]
[0 1 1 0 0 1]


### Treekernel

In [None]:
K2 = compute_kernel(t_list[:2],t_list[2:],kernels.tree_kernel)

In [None]:
print(K2)
clf4  = svm.SVC(kernel='precomputed')
clf4.fit(K2,y)

### KNN

In [16]:
from sklearn import neighbors
clf5 = neighbors.KNeighborsClassifier()
#clf5.fit(X,y)

### MaxEnt

In [17]:
from sklearn import linear_model
clf6 = linear_model.LogisticRegression()
#clf6.fit(X,y)

### Random Forest

In [15]:
from sklearn import ensemble
clf7 = ensemble.RandomForestClassifier()
#clf7.fit(X,y)

## Wikipedia extraction

In [180]:
import wikipedia as wk
p = wk.search('2010s in film')
page = wk.page(p[0])
s = page.content

In [201]:
import re
reg = re.compile('\n\n\n=== 0-9 ===\n(.*)\n\n\n== See also ==\n',re.DOTALL)
films = reg.findall(s)[0]
films = films.split('\n')

In [307]:
films = [f for f in films if(not ('===' in f) and len(f)>0)]

In [309]:
films[0]
reg2 = re.compile('\(.*\)')
titles = []
for f in films:
    rep = reg2.findall(f)
    if(len(rep)>0):
        titles.append(f.replace(rep[0],''))

In [315]:
reg3 = re.compile('\n\n\n== Plot ==\n(.*?)\n\n\n==',re.DOTALL)
plots = []
for t in titles:
    p = wk.search(t)
    print p
    try:
        page = wk.page(p[0])
    except:
        continue
    cont = page.content
    plot = reg3.findall(cont)
    if(len(plot)>0):
        plots.append((t,plot[0]))
        #print plot[0]
        #print '\n\n\n'
print len(plots)

[u'Abandon', u'The Abandoned (Star Trek: Deep Space Nine)', u'The Abandoned (2006 film)', u'The Abandoned Field: Free Fire Zone', u'The Abandoned (1955 film)', u'Our Lady of the Abandoned Parish Church (Marikina)', u'The Abandoned Soldier', u'To the Abandoned Sacred Beasts', u'The Abandoned Well', u'Abandoned pets']
[u'About a Boy', u'About a Boy (soundtrack)', u'About a Boy (TV series)', u'About a Boy (Homeland)', u'About a Boy (film)', u'About a Boy (novel)', u'Gone Again', u'Benjy', u'Wow! (comic)', u'Pilot (About a Boy)']
[u'About Schmidt', u'Alexander Payne', u"8th Critics' Choice Awards", u'2002 Los Angeles Film Critics Association Awards', u'2002 New York Film Critics Circle Awards', u'Jim Taylor (writer)', u'Hope Davis', u'60th Golden Globe Awards', u'June Squibb', u'Kevin Tent']
[u'Acacia', u'List of Acacia species', u'Acacia pycnantha', u'Vachellia cornigera', u'Acacia Ridge, Queensland', u'Acacia retinodes', u'List of Acacia species used for tannin production', u'Acacia vert

In [317]:
import pandas as pd
df = pd.DataFrame(plots)
df.to_csv("narrative2.csv",sep='\t',encoding='utf-8',index=False,header=['title','plot'])

### 2011

In [324]:
df2 = pd.read_csv("narrative2.csv",sep='\t')

In [327]:
df2.loc[1]['title']

'Accepted '

## Science wikipedia

In [357]:
import wikipedia as wk
p = wk.search('science')
page = wk.page(p[0])
s = page.content
links = page.links
print len(links)

714


In [368]:
reg3 = re.compile('(.*?)\n\n\n==',re.DOTALL)
articles = []
for t in links:
    p = wk.search(t)
    print p
    try:
        page = wk.page(p[0])
    except:
        continue
    cont = page.content
    article = reg3.findall(cont)
    if(len(plot)>0):
        articles.append((t,article[0]))

print len(articles)

[u'A. I. Sabra', u'George Sarton Medal', u'Sabra (name)', u'Fields of Force', u'Moon', u'Alchemy and chemistry in medieval Islam', u'Treatise on Light', u'Science in the medieval Islamic world', u'Abdul Hamid', u'Wheeler Thackston']
[u'A priori and a posteriori', u'A priori', u'A priori probability', u'Empirical evidence', u'Constructed language', u'Epistemology', u'Endurantism', u'Philosophical logic', u'Modal operator', u'List of Boolean algebra topics']
[u'Abbasid Caliphate', u'Abbasid invasion of Asia Minor (782)', u'Abbasid invasion of Asia Minor (806)', u'Az-Zahir (Abbasid caliph)', u'Battle of Kopidnadon', u"Al-Mu'tamid", u'Alid Revolt (762\u201363)', u'Al-Mustansir (Baghdad)', u'Siege of Kamacha', u"Al-Musta'sim"]
[u'Academy of sciences', u'National Academy of Sciences', u'Russian Academy of Sciences', u'Armenian National Academy of Sciences', u'Proceedings of the National Academy of Sciences of the United States of America', u'Academy of Sciences of Moldova', u'Polish Academy 

In [378]:
a_p = [a for a in articles if(len(a[1])>1800)]
df = pd.DataFrame(a_p)
df.to_csv("informative.csv",sep='\t',encoding='utf-8',index=False,header=['title','plot'])

In [376]:
len(a_p)

226

In [371]:
df.head()

Unnamed: 0,0,1
0,A. I. Sabra,Abdelhamid I. Sabra (1924-2013) was a professo...
1,A priori and a posteriori,"The Latin phrases a priori (lit. ""from the ear..."
2,Abbasid,The Abbasid Caliphate (/əˈbæsᵻd/ or /ˈæbəsᵻd/ ...
3,Academy of Sciences,An academy of sciences is a national academy o...
4,Académie des Sciences,The French Academy of Sciences (French: Académ...


In [367]:
for a in articles:
    if(len(a[1])>2000):
        print len(a[1])
        print a[1]
        print "\n\n\n\n\n"

2023
The Abbasid Caliphate (/əˈbæsᵻd/ or /ˈæbəsᵻd/ Arabic: الخلافة العباسية‎ al-Khilāfah al-‘Abbāsīyah) was the third of the Islamic caliphates to succeed the Islamic prophet Muhammad. The Abbasid dynasty descended from Muhammad's youngest uncle, Abbas ibn Abd al-Muttalib (566–653 CE), from whom the dynasty takes its name. They ruled as caliphs, for most of their period from their capital in Baghdad in modern-day Iraq, after assuming authority over the Muslim empire from the Umayyads in 750 CE (132 AH).
The Abbasid caliphate first centered its government in Kufa, but in 762 the caliph Al-Mansur founded the city of Baghdad, north of the Sasanian capital city of Ctesiphon. The choice of a capital so close to Persia proper reflected a growing reliance on Persian bureaucrats, most notably of the Barmakid family, to govern the territories conquered by Arab Muslims, as well as an increasing inclusion of non-Arab Muslims in the ummah. Despite this initial cooperation, the Abbasids of the late

#### Making the files

In [478]:
df = pd.read_csv('../data/narrative2.csv',encoding='utf-8',sep='\t')

l =[]
for i in range(len(df)):
    #print len(df.loc[i]['plot'])
    #print '\n\n\n'
    if(len(df.loc[i]['plot'])>1800):
        l.append(i)

In [479]:
df = pd.read_csv('../data/narrative2.csv',encoding='utf-8',sep='\t')
df = df.loc[l]
write_path='../data/narrative/'
for i in l:
    df_tmp = df.loc[i]
    filename = df_tmp['title'].replace(' ','')
    filename = filename.replace('/','')
    file_tmp = open(write_path+filename+".txt", "wb")
    file_tmp.write(df_tmp["plot"].encode('utf-8'))
    file_tmp.close()
    #df_tmp["plot"].to_csv(df.loc[i][0],sep='\t')

In [453]:
open(write_path+"hello.txt", "wb")

<open file '../data/narrative/hello.txt', mode 'wb' at 0x7f1fdd7fb270>

## Web crawling with beautiful soup

In [441]:
from bs4 import BeautifulSoup
import urllib
import time
import random

r = urllib.urlopen('http://millercenter.org/president/speeches').read()
soup = BeautifulSoup(r)
time.sleep(random.randint(1, 2) * .931467298)
print type(soup)

<class 'bs4.BeautifulSoup'>


In [434]:
urls = soup.findAll('a',attrs={'class':'transcript'})

In [436]:
urls = ['http://millercenter.org/president'+u['href'] for u in urls]

In [437]:
urls

['http://millercenter.org/president/president/obama/speeches/speech-4427',
 'http://millercenter.org/president/president/obama/speeches/speech-4424',
 'http://millercenter.org/president/president/obama/speeches/speech-4453',
 'http://millercenter.org/president/president/obama/speeches/speech-4612',
 'http://millercenter.org/president/president/obama/speeches/speech-5502',
 'http://millercenter.org/president/president/obama/speeches/speech-5548',
 'http://millercenter.org/president/president/obama/speeches/speech-5706',
 'http://millercenter.org/president/president/gwbush/speeches/speech-3645',
 'http://millercenter.org/president/president/gwbush/speeches/speech-3986',
 'http://millercenter.org/president/president/gwbush/speeches/speech-4540',
 'http://millercenter.org/president/president/gwbush/speeches/speech-4541',
 'http://millercenter.org/president/president/gwbush/speeches/speech-4542',
 'http://millercenter.org/president/president/gwbush/speeches/speech-4463',
 'http://millercent