In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings 
warnings.filterwarnings("ignore")

In [2]:
data_1 = pd.read_csv("train.csv")
data_2 = pd.read_csv("test.csv")

In [3]:
data_1.head()

Unnamed: 0,id,url_legal,license,excerpt,target,standard_error
0,c12129c31,,,When the young people returned to the ballroom...,-0.340259,0.464009
1,85aa80a4c,,,"All through dinner time, Mrs. Fayre was somewh...",-0.315372,0.480805
2,b69ac6792,,,"As Roger had predicted, the snow departed as q...",-0.580118,0.476676
3,dd1000b26,,,And outside before the palace a great garden w...,-1.054013,0.450007
4,37c1b32fb,,,Once upon a time there were Three Bears who li...,0.247197,0.510845


In [4]:
data_2.head()

Unnamed: 0,id,url_legal,license,excerpt
0,c0f722661,,,My hope lay in Jack's promise that he would ke...
1,f0953f0a5,,,Dotty continued to go to Mrs. Gray's every nig...
2,0df072751,,,It was a bright and cheerful scene that greete...
3,04caf4e0c,https://en.wikipedia.org/wiki/Cell_division,CC BY-SA 3.0,Cell division is the process by which a parent...
4,0e63f8bea,https://en.wikipedia.org/wiki/Debugging,CC BY-SA 3.0,Debugging is the process of finding and resolv...


In [5]:
data_1.shape

(2834, 6)

In [6]:
data_2.shape

(7, 4)

In [7]:
data_1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2834 entries, 0 to 2833
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              2834 non-null   object 
 1   url_legal       830 non-null    object 
 2   license         830 non-null    object 
 3   excerpt         2834 non-null   object 
 4   target          2834 non-null   float64
 5   standard_error  2834 non-null   float64
dtypes: float64(2), object(4)
memory usage: 133.0+ KB


In [8]:
data_2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   id         7 non-null      object
 1   url_legal  3 non-null      object
 2   license    3 non-null      object
 3   excerpt    7 non-null      object
dtypes: object(4)
memory usage: 352.0+ bytes


In [9]:
data_1.isnull().sum()/100

id                 0.00
url_legal         20.04
license           20.04
excerpt            0.00
target             0.00
standard_error     0.00
dtype: float64

In [10]:
data_2.isnull().sum()/100

id           0.00
url_legal    0.04
license      0.04
excerpt      0.00
dtype: float64

In [11]:
data_1 = data_1.drop(["id","url_legal","license","standard_error"],axis = 1)
data_1.head()

Unnamed: 0,excerpt,target
0,When the young people returned to the ballroom...,-0.340259
1,"All through dinner time, Mrs. Fayre was somewh...",-0.315372
2,"As Roger had predicted, the snow departed as q...",-0.580118
3,And outside before the palace a great garden w...,-1.054013
4,Once upon a time there were Three Bears who li...,0.247197


In [12]:
data_2 = data_2.drop(["id","url_legal","license"],axis=1)
data_2.head()

Unnamed: 0,excerpt
0,My hope lay in Jack's promise that he would ke...
1,Dotty continued to go to Mrs. Gray's every nig...
2,It was a bright and cheerful scene that greete...
3,Cell division is the process by which a parent...
4,Debugging is the process of finding and resolv...


In [13]:
data_1["excerpt"]

0       When the young people returned to the ballroom...
1       All through dinner time, Mrs. Fayre was somewh...
2       As Roger had predicted, the snow departed as q...
3       And outside before the palace a great garden w...
4       Once upon a time there were Three Bears who li...
                              ...                        
2829    When you think of dinosaurs and where they liv...
2830    So what is a solid? Solids are usually hard be...
2831    The second state of matter we will discuss is ...
2832    Solids are shapes that you can actually touch....
2833    Animals are made of many cells. They eat thing...
Name: excerpt, Length: 2834, dtype: object

In [14]:
data_2["excerpt"]

0    My hope lay in Jack's promise that he would ke...
1    Dotty continued to go to Mrs. Gray's every nig...
2    It was a bright and cheerful scene that greete...
3    Cell division is the process by which a parent...
4    Debugging is the process of finding and resolv...
5    To explain transitivity, let us look first at ...
6    Milka and John are playing in the garden. Her ...
Name: excerpt, dtype: object

In [15]:
data_1["excerpt"] = data_1["excerpt"].str.lower()
data_2["excerpt"] = data_2["excerpt"].str.lower()

In [16]:
import string
punc = string.punctuation

def remove_punc(x):
    translator = str.maketrans(" "," ",punc)
    return x.translate(translator)

data_1["excerpt"] = data_1["excerpt"].apply(lambda x:remove_punc(x))
data_2["excerpt"] = data_2["excerpt"].apply(lambda x:remove_punc(x))

In [17]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

def remove_stopwords(text):
    return ' '.join(word for word in text.split() if word not in stop_words)

data_1["excerpt"] = data_1["excerpt"].apply(remove_stopwords)
data_2["excerpt"] = data_2["excerpt"].apply(remove_stopwords)

In [18]:
data_1.tail()

Unnamed: 0,excerpt,target
2829,think dinosaurs lived picture see hot steamy s...,1.71139
2830,solid solids usually hard molecules packed tog...,0.189476
2831,second state matter discuss liquid solids hard...,0.255209
2832,solids shapes actually touch three dimensions ...,-0.215279
2833,animals made many cells eat things digest insi...,0.300779


In [19]:
data_2.tail()

Unnamed: 0,excerpt
2,bright cheerful scene greeted eyes captain ray...
3,cell division process parent cell divides two ...
4,debugging process finding resolving defects pr...
5,explain transitivity let us look first totally...
6,milka john playing garden little sister playin...


In [20]:
import re

def remove_digits(text):
    return re.sub("[0-9]+"," ",text)

data_1["excerpt"] = data_1["excerpt"].apply(remove_digits)
data_2["excerpt"] = data_2["excerpt"].apply(remove_digits)

In [21]:
def remove_url(text):
    return re.sub("((www.[^s]+) | (https?://[^s]+))"," ",text)

data_1["excerpt"] = data_1["excerpt"].apply(remove_url)
data_2["excerpt"] = data_2["excerpt"].apply(remove_url)

In [22]:
regex = r'\b(\w+)(?:\W+\1\b)+'

def clean_repeat_words(text):
    return re.sub(regex," ",text)

data_1["excerpt"] = data_1["excerpt"].apply(clean_repeat_words)
data_2["excerpt"] = data_2["excerpt"].apply(clean_repeat_words)

In [23]:
from nltk.stem import PorterStemmer
ps = PorterStemmer()

def stemming_data(data):
  text = [ps.stem(data) for word in data]
  return data

data_1["excerpt"] = data_1["excerpt"].apply(lambda x: stemming_data(x))
data_2["excerpt"] = data_2["excerpt"].apply(lambda x: stemming_data(x))

In [24]:
from nltk.stem import WordNetLemmatizer
wnl = WordNetLemmatizer()

def lemmatizer_on_text(data):
    text = [wnl.lemmatize(word) for word in data]
    return data

data_1["excerpt"] = data_1["excerpt"].apply(lambda x: lemmatizer_on_text(x))
data_2["excerpt"] = data_2["excerpt"].apply(lambda x: lemmatizer_on_text(x))

In [25]:
X = data_1.drop('target', axis = 1)
y = data_1['target']

In [26]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [27]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
cv

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [28]:
cv.fit(X_train['excerpt'].values)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [29]:
X_train = cv.transform(X_train['excerpt'])
X_test = cv.transform(X_test['excerpt'])

In [30]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [31]:
lr.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [32]:
y_pred = lr.predict(X_test)
y_pred

array([ 4.10669302e-01, -1.11916006e+00, -1.16969753e-01, -2.52431483e+00,
       -5.26172460e-01, -1.36353936e+00,  1.05893022e+00, -7.50170960e-01,
       -1.20727875e+00, -9.69001441e-01, -1.71095812e+00,  3.14047908e-01,
       -7.20987938e-01, -8.67026149e-01, -8.10250327e-01,  5.14180927e-01,
       -2.13184896e-01, -5.59709835e-01, -7.30123130e-01, -9.20280798e-01,
       -5.72492665e-01, -1.74626949e+00, -7.52864823e-01, -1.22101139e+00,
       -5.85627415e-01, -1.76132264e+00, -1.73534164e+00, -7.10104218e-01,
       -4.94207763e-01,  1.11902244e-01, -1.36635814e+00, -1.07059098e+00,
        1.49811836e-01, -5.46815987e-01, -7.73050756e-01, -1.04452710e+00,
       -1.26274008e+00, -2.89829611e-01, -1.56567066e+00, -4.40662971e-01,
       -2.42811847e+00, -1.14740513e+00, -1.40788581e+00,  1.01291942e+00,
        3.00934089e-01, -9.31779948e-02, -3.20535190e-01, -1.59776094e+00,
       -1.48993473e+00, -1.34880739e+00, -1.49745368e+00, -1.97858764e+00,
       -7.48543346e-01, -

In [33]:
from sklearn.metrics import mean_squared_error
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print("RMSE:", rmse)

RMSE: 0.815735352502365


In [34]:
#test_data
X_test_cv = cv.transform(data_2['excerpt'])

In [35]:
y_test_pred = lr.predict(X_test_cv)
y_test_pred

array([-1.63789834, -0.02290583, -0.34503597, -1.48109537, -1.795555  ,
       -0.72504585, -0.70696829])