# Regression analysis on fake and satire news articles

In [2]:
# Load the full satire/fake spreadsheet into `my_data` and preview its
# first rows.
library(readxl)
my_data <- read_excel(path = "data/satire_fake_full.xlsx")
head(my_data)

X__1,DESSC,DESWC,DESPL,DESSL,DESSLd,DESWLsy,DESWLsyd,DESWLlt,DESWLltd,⋯,WRDIMGc,WRDMEAc,WRDPOLc,WRDHYPn,WRDHYPv,WRDHYPnv,RDFRE,RDFKGL,RDL2,label
0,1,38,1,38.0,0.0,1.526,0.893,4.526,2.555,⋯,404.2,430.0,4.821,5.963,1.807,1.932,39.165,17.237,5.569,1
1,12,352,12,30.083,13.228,1.716,1.007,5.182,2.834,⋯,419.217,439.354,3.764,5.231,1.786,1.9,31.888,16.099,5.721,1
2,1,2,1,2.0,0.0,2.5,2.121,5.5,2.121,⋯,374.0,438.0,1.667,0.0,0.0,0.0,0.0,14.69,-4.174,1
3,1,41,1,41.0,0.0,1.634,0.994,4.902,2.634,⋯,406.0,436.875,4.0,4.631,1.611,1.614,26.984,19.681,-0.806,1
4,35,518,35,14.829,7.298,1.444,0.749,4.407,2.281,⋯,412.187,419.177,4.022,5.926,1.702,1.718,69.651,7.221,15.645,1
5,22,550,22,25.227,9.88,1.722,1.041,5.015,2.801,⋯,421.653,450.525,3.232,5.415,1.677,1.986,35.779,14.48,9.306,0


### Principal Component Analysis (PCA)
To account for multicollinearity in our regression model, we first apply PCA to the full set of Coh-Metrix features.

In [3]:
# The "psych" package provides principal(). To install it, uncomment the
# two following lines:
# options(download.file.method = "wget")
# install.packages("psych")
library(psych)

# Columns left out of the PCA input. "label" is kept separately as `y`.
drops <- c("X__1", "label", "DESPL")
x <- my_data[, !is.element(names(my_data), drops)]
y <- my_data["label"]

# Uncomment to inspect the eigenvalues of the correlation matrix when
# choosing the number of components:
# print(eigen(cor(x)))

dim(x)

# Principal components with varimax rotation, keeping 103 components.
pca <- psych::principal(r = x, nfactors = 103, rotate = "varimax")

# Uncomment to print the sorted loadings with a 0.4 cutoff:
# print(pca$loadings, cutoff = 0.4, sort = TRUE)

vars <- as.matrix(pca$scores)

# Save the component loadings to a CSV file.
write.csv(pca$loadings, file = "data/loadings.csv")

## Linear Regression

In [4]:
# Ordinary least-squares fit of the article label (`y`, flattened to a plain
# vector) on the rotated PCA component scores, followed by the coefficient
# table.
lin_model <- lm(formula = unlist(y) ~ pca$scores)
summary(lin_model)


Call:
lm(formula = unlist(y) ~ pca$scores)

Residuals:
     Min       1Q   Median       3Q      Max 
-1.05649 -0.29919 -0.00396  0.29684  0.82144 

Coefficients:
                  Estimate Std. Error t value Pr(>|t|)    
(Intercept)      4.428e-01  2.142e-02  20.671  < 2e-16 ***
pca$scoresRC1   -1.504e-02  2.145e-02  -0.701 0.483658    
pca$scoresRC7    4.251e-03  2.145e-02   0.198 0.843036    
pca$scoresRC2   -2.274e-03  2.145e-02  -0.106 0.915654    
pca$scoresRC3    3.364e-02  2.145e-02   1.568 0.117869    
pca$scoresRC4    1.541e-02  2.145e-02   0.719 0.472963    
pca$scoresRC76   1.293e-02  2.145e-02   0.603 0.547061    
pca$scoresRC5    8.118e-02  2.145e-02   3.785 0.000185 ***
pca$scoresRC8    1.633e-02  2.145e-02   0.761 0.447167    
pca$scoresRC6   -3.015e-02  2.145e-02  -1.406 0.160860    
pca$scoresRC13   1.198e-02  2.145e-02   0.559 0.576891    
pca$scoresRC11  -2.157e-02  2.145e-02  -1.006 0.315401    
pca$scoresRC25   1.316e-03  2.145e-02   0.061 0.951128    
pca$scoresR

### Stepwise backward linear regression

In [5]:
# Backward stepwise elimination: start from an aov() fit of the label in `y`
# on every PCA score column and let step() drop terms. The label is only the
# response; it is not among the predictors.
aov_model <- step(
  aov(as.matrix(y) ~ ., data = as.data.frame(pca$scores)),
  direction = "backward"
)
summary(aov_model)

Start:  AIC=-597.46
as.matrix(y) ~ RC1 + RC7 + RC2 + RC3 + RC4 + RC76 + RC5 + RC8 + 
    RC6 + RC13 + RC11 + RC25 + RC9 + RC73 + RC15 + RC31 + RC10 + 
    RC42 + RC21 + RC14 + RC19 + RC26 + RC34 + RC27 + RC16 + RC54 + 
    RC41 + RC28 + RC24 + RC20 + RC40 + RC32 + RC12 + RC30 + RC35 + 
    RC29 + RC17 + RC18 + RC37 + RC22 + RC38 + RC33 + RC55 + RC44 + 
    RC51 + RC48 + RC61 + RC52 + RC47 + RC45 + RC43 + RC49 + RC50 + 
    RC57 + RC46 + RC72 + RC36 + RC59 + RC53 + RC65 + RC63 + RC67 + 
    RC56 + RC62 + RC60 + RC58 + RC64 + RC80 + RC66 + RC69 + RC23 + 
    RC77 + RC71 + RC70 + RC75 + RC91 + RC79 + RC82 + RC74 + RC83 + 
    RC68 + RC39 + RC81 + RC95 + RC84 + RC86 + RC87 + RC90 + RC89 + 
    RC85 + RC88 + RC93 + RC92 + RC78 + RC94 + RC96 + RC97 + RC98 + 
    RC99 + RC102 + RC100 + RC103 + RC101

        Df Sum of Sq    RSS     AIC
- RC51   1    0.0000 57.907 -599.46
- RC57   1    0.0001 57.907 -599.46
- RC25   1    0.0007 57.908 -599.45
- RC2    1    0.0021 57.909 -599.44
- RC24   1    0

             Df Sum Sq Mean Sq F value   Pr(>F)    
RC3           1   0.46   0.464   2.699 0.101253    
RC5           1   2.70   2.702  15.723 8.78e-05 ***
RC6           1   0.37   0.373   2.168 0.141724    
RC9           1   0.98   0.985   5.730 0.017165 *  
RC73          1   2.82   2.817  16.392 6.26e-05 ***
RC15          1   1.13   1.134   6.601 0.010580 *  
RC19          1   5.19   5.191  30.202 7.20e-08 ***
RC34          1   0.88   0.879   5.116 0.024274 *  
RC27          1   0.39   0.389   2.265 0.133145    
RC20          1   0.83   0.831   4.835 0.028504 *  
RC30          1   3.07   3.067  17.846 3.01e-05 ***
RC35          1   0.99   0.993   5.779 0.016701 *  
RC37          1   0.66   0.659   3.833 0.050980 .  
RC55          1   1.91   1.913  11.128 0.000935 ***
RC44          1   0.71   0.715   4.160 0.042100 *  
RC61          1   0.98   0.977   5.682 0.017638 *  
RC47          1   0.40   0.400   2.330 0.127781    
RC45          1   0.77   0.770   4.479 0.034979 *  
RC43        