In [4]:
import pandas as pd
from nltk.corpus import stopwords

# Load the raw training and test tables from the working directory.
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# NOTE: the bare `.head()` previews that used to sit between the steps below
# were removed; only the last expression of a cell is displayed, so they had
# no effect.

# Lowercase every token and collapse whitespace runs to single spaces.
train['text'] = train['text'].apply(lambda doc: " ".join(tok.lower() for tok in doc.split()))

# Remove punctuation (anything that is neither a word character nor whitespace).
# `regex=True` is passed explicitly: pandas >= 2.0 treats the pattern as a
# literal string by default, which would silently leave all punctuation in
# place. The raw string avoids invalid-escape warnings for `\w` / `\s`.
train['text'] = train['text'].str.replace(r'[^\w\s]', '', regex=True)

# Token count after punctuation removal.
# NOTE: split(" ") yields [''] for an empty string, so empty texts count as 1.
train['word_count2'] = train['text'].apply(lambda doc: len(str(doc).split(" ")))

# Remove English stop words. `stop` keeps the original list; membership is
# tested against a set so each lookup is O(1) instead of a list scan.
stop = stopwords.words('english')
_stop_set = set(stop)
train['text'] = train['text'].apply(lambda doc: " ".join(tok for tok in doc.split() if tok not in _stop_set))

# Token count after stop-word removal (same counting rule as word_count2).
train['word_count3'] = train['text'].apply(lambda doc: len(str(doc).split(" ")))

# Last ten entries of the corpus-wide token frequency table (value_counts sorts
# by descending count, so these are the least frequent tokens).
freq = pd.Series(' '.join(train['text']).split()).value_counts()[-10:]



In [5]:
# Remove uncommon words: drop the ten least frequent tokens from every document.
# The tail of the frequency table is recomputed here rather than taken from the
# `freq` left behind by the previous cell. This cell rebinds `freq` from a
# Series to a list, so running the original code a second time raised a
# TypeError on `list(freq.index)` (for a list, `.index` is a method).
freq = pd.Series(' '.join(train['text']).split()).value_counts()[-10:]
freq = list(freq.index)
train['text'] = train['text'].apply(lambda doc: " ".join(tok for tok in doc.split() if tok not in freq))
train['text'].head()



0    info found 100 pages 45 mb pdf files wait unti...
1    team members drewes van der laag urllink mail ...
2    het kader van kernfusie op aarde maak je eigen...
3                                      testing testing
4    thanks yahoos toolbar capture urls popupswhich...
Name: text, dtype: object

In [6]:
# Remove common words: drop the ten most frequent tokens of the whole corpus.
corpus_tokens = ' '.join(train['text']).split()
freq = list(pd.Series(corpus_tokens).value_counts()[:10].index)

train['text'] = train['text'].apply(
    lambda doc: " ".join(tok for tok in doc.split() if tok not in freq)
)

# Preview the filtered text.
train['text'].head()



0    info found 100 pages 45 mb pdf files wait unti...
1    team members drewes van der laag mail ruiyu xi...
2    het kader van kernfusie op aarde maak je eigen...
3                                      testing testing
4    thanks yahoos toolbar capture urls popupswhich...
Name: text, dtype: object

In [7]:
# Remove rare words: drop the ten tokens at the tail of the frequency table.
corpus_tokens = ' '.join(train['text']).split()
freq = list(pd.Series(corpus_tokens).value_counts()[-10:].index)

train['text'] = train['text'].apply(
    lambda doc: " ".join(tok for tok in doc.split() if tok not in freq)
)

# Preview the filtered text.
train['text'].head()



0    info found 100 pages 45 mb pdf files wait unti...
1    team members drewes van der laag mail ruiyu xi...
2    het kader van kernfusie op aarde maak je eigen...
3                                      testing testing
4    thanks yahoos toolbar capture urls popupswhich...
Name: text, dtype: object

In [10]:
from textblob import TextBlob, Word

# Spelling correction (demonstration only): run TextBlob's corrector over the
# first five documents. The result is not assigned, so train['text'] is unchanged.
train['text'].iloc[:5].apply(lambda doc: str(TextBlob(doc).correct()))

# Tokenization (demonstration only): split one document into a sequence of words.
TextBlob(train['text'][1]).words

# Stemming (rule-based removal of suffixes such as "ing", "ly", "s") is not
# applied; nltk.stem.PorterStemmer would be the tool for it. Lemmatization is
# preferred because it converts each word to its root word instead of just
# stripping suffixes.
train['text'] = train['text'].apply(
    lambda doc: " ".join(Word(tok).lemmatize() for tok in doc.split())
)
train['text'].head()

0    info found 100 page 45 mb pdf file wait untill...
1    team member drewes van der laag mail ruiyu xie...
2    het kader van kernfusie op aarde maak je eigen...
3                                      testing testing
4    thanks yahoo toolbar capture url popupswhich m...
Name: text, dtype: object

In [31]:
# Polarity score of each document (first field of TextBlob's sentiment tuple).
sentiment = train['text'].apply(lambda doc: TextBlob(doc).sentiment.polarity)

In [33]:
# Store the per-document sentiment polarity as a new feature column.
train['sentiment'] = sentiment

In [37]:
# Preview the feature table. The recorded output already shows gender_*/topic_*
# dummy columns because this cell (In [37]) was executed after the get_dummies
# cell (In [36]) that appears further down.
train.head()

Unnamed: 0,post.id,user.id,sign,date,text,age,word_count2,word_count3,sentiment,gender_female,...,topic_RealEstate,topic_Religion,topic_Science,topic_Sports-Recreation,topic_Student,topic_Technology,topic_Telecommunications,topic_Tourism,topic_Transportation,topic_indUnk
0,1,11869,Leo,"14,May,2004",info found 100 page 45 mb pdf file wait untill...,15,28,15,0.0,0,...,0,0,0,0,1,0,0,0,0,0
1,2,11869,Leo,"13,May,2004",team member drewes van der laag mail ruiyu xie...,15,20,16,0.0,0,...,0,0,0,0,1,0,0,0,0,0
2,3,11869,Leo,"12,May,2004",het kader van kernfusie op aarde maak je eigen...,15,4326,2383,0.087728,0,...,0,0,0,0,1,0,0,0,0,0
3,4,11869,Leo,"12,May,2004",testing testing,15,2,2,0.0,0,...,0,0,0,0,1,0,0,0,0,0
4,5,16332,Aquarius,"11,June,2004",thanks yahoo toolbar capture url popupswhich m...,33,65,37,0.159375,0,...,0,0,0,0,0,0,0,0,0,0


In [36]:
# One-hot encode the categorical 'gender' and 'topic' columns.
# dtype=int pins the indicator columns to 0/1 integers, matching the recorded
# outputs; pandas >= 2.0 would otherwise emit True/False booleans, which also
# end up as text in the exported CSV.
# NOTE(review): this cell is not re-runnable as-is - presumably a second run
# raises KeyError because 'gender' and 'topic' have already been replaced.
train = pd.get_dummies(train, columns=['gender', 'topic'], dtype=int)

In [46]:
# Keep only model inputs and the target: discard the post id, zodiac sign,
# date and cleaned text columns.
unused_columns = ['post.id', 'sign', 'date', 'text']
traindropped = train.drop(columns=unused_columns)

In [47]:
# Preview the reduced training table (identifier, target and feature columns).
traindropped.head()

Unnamed: 0,user.id,age,word_count2,word_count3,sentiment,gender_female,gender_male,topic_Accounting,topic_Advertising,topic_Agriculture,...,topic_RealEstate,topic_Religion,topic_Science,topic_Sports-Recreation,topic_Student,topic_Technology,topic_Telecommunications,topic_Tourism,topic_Transportation,topic_indUnk
0,11869,15,28,15,0.0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,11869,15,20,16,0.0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,11869,15,4326,2383,0.087728,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,11869,15,2,2,0.0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,16332,33,65,37,0.159375,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [48]:
# Persist the cleaned training table. With to_csv's defaults the row index is
# written too, as an unnamed first column of the file.
traindropped.to_csv('trainclean.csv')

In [50]:
# Load the pre-cleaned test table.
# NOTE(review): testclean.csv is not produced anywhere in the visible cells;
# presumably it was generated by running the same cleaning steps on test.csv.
testclean = pd.read_csv("testclean.csv")

In [64]:
# Discard the first column of the test table (presumably the row index saved
# alongside the data - confirm against testclean.csv).
# WARNING: not idempotent - every re-run removes one more column.
leading_column = testclean.columns[0]
testclean = testclean.drop(columns=[leading_column])

In [54]:
# Regression target: the 'age' column.
y = traindropped["age"]

In [55]:
# Feature matrix: every remaining column except the 'age' target.
x = traindropped.drop(columns=['age'])

In [65]:
# Sanity check: (rows, columns) of the test table after dropping its first column.
testclean.shape

(238323, 46)

In [70]:
# First 50 rows of features and target, as DataFrame/Series and as NumPy arrays.
xsmall, ysmall = x.iloc[:50], y.iloc[:50]
xsmallarray, ysmallarray = xsmall.values, ysmall.values

In [81]:
# Fit an ordinary least-squares regression of age on the training features and
# predict the age for every row of the test table.
import matplotlib.pyplot as plt
import numpy as np
import sklearn
from sklearn.linear_model import LinearRegression

# Training features and target as plain NumPy arrays.
# NOTE(review): x still contains the raw 'user.id' identifier, so it is used
# as a numeric predictor - confirm this is intended.
xarraytrain = x.values
yarraytrain = y.values

# Create and fit the linear regression model on the training data.
lm = LinearRegression()
lm.fit(xarraytrain, yarraytrain)

# The model was fitted on a bare array, so scikit-learn cannot verify feature
# names at predict time. Select the test columns by training column name, in
# training order, so a reordered or incomplete test table cannot silently
# produce predictions from mismatched features.
missing_features = [col for col in x.columns if col not in testclean.columns]
if missing_features:
    raise ValueError(
        "testclean is missing training feature columns: %s" % missing_features
    )
predictions = lm.predict(testclean[x.columns].values)

In [82]:
# Wrap the prediction array in a DataFrame; its single column gets the integer label 0.
predictionsdf = pd.DataFrame(predictions)

In [83]:
# Summary statistics of the predicted ages.
# NOTE(review): the recorded output has a maximum of about 164, which is not a
# plausible age - worth investigating before using these predictions.
predictionsdf.describe()

Unnamed: 0,0
count,238323.0
mean,23.608497
std,3.4068
min,13.831441
25%,23.346411
50%,24.096648
75%,25.887594
max,163.95239


In [85]:
# Preview the first predicted ages.
predictionsdf.head()

Unnamed: 0,0
0,23.928802
1,24.4585
2,17.666821
3,17.804392
4,17.966817


In [86]:
# User id of every test row, used to group the row-level predictions per user.
ids = testclean["user.id"]

In [88]:
# Sanity check: the number of ids should equal the number of predictions.
ids.shape

(238323,)

In [89]:
# Attach the user id to each prediction. The assignment aligns on the index;
# presumably both objects carry the default 0..n-1 index, so rows match by position.
predictionsdf['userid'] = ids

In [90]:
# Preview the predictions together with their user ids.
predictionsdf.head()

Unnamed: 0,0,userid
0,23.928802,4876
1,24.4585,4876
2,17.666821,12227
3,17.804392,12227
4,17.966817,12227


In [94]:
# Mean predicted age per user. The original line referenced `.mean` without
# calling it, which bound `preds` to a method object instead of the result.
preds = predictionsdf.groupby(['userid']).mean()

In [101]:
# Mean predicted age per user, as a Series indexed by userid.
preds = predictionsdf.groupby('userid')[0].mean()

In [103]:
# Write the per-user mean predictions to disk.
# NOTE(review): whether a header row is written depends on the pandas version's
# default for Series.to_csv - confirm the file matches the expected format.
preds.to_csv('preds.csv')