# Regression analysis on fake and satire news articles

In [1]:
# Load the spreadsheet with the Coh-Metrix output for every article
# (one row per article) and preview the first rows.
library(readxl)
my_data <- read_excel(path = "data/cohmetrix/output/satirefake_full.xlsx")
head(my_data)

X__1,DESPC,DESSC,DESWC,DESPL,DESPLd,DESSL,DESSLd,DESWLsy,DESWLsyd,⋯,WRDMEAc,WRDPOLc,WRDHYPn,WRDHYPv,WRDHYPnv,RDFRE,RDFKGL,RDL2,id,label
0,6,19,365,3.167,2.137,19.368,10.95,1.6,0.943,⋯,457.425,3.804,5.321,1.826,1.896,51.976,10.782,10.179,100,0
1,4,23,613,5.75,0.957,26.913,12.442,1.475,0.833,⋯,414.503,3.628,5.618,1.688,1.587,54.998,12.209,15.315,101,1
2,4,24,650,6.0,1.826,27.167,15.171,1.382,0.702,⋯,427.278,4.166,5.395,1.488,1.56,62.429,11.28,22.57,102,0
3,5,20,572,4.0,1.0,28.9,12.574,1.507,0.84,⋯,426.274,3.914,6.171,1.91,1.979,50.314,13.347,16.426,103,1
4,5,24,592,4.8,0.447,24.792,11.03,1.492,0.831,⋯,424.694,3.844,5.743,1.588,1.731,55.575,11.636,19.263,104,1
5,10,17,315,1.7,1.567,18.529,12.334,1.616,0.942,⋯,430.746,3.959,6.279,1.788,2.02,51.314,10.705,15.545,105,0


### Principal Component Analysis (PCA)
To account for multicollinearity in our regression model, we first apply PCA to the full set of Coh-Metrix features.

In [2]:
# The "psych" package provides principal(). If it is not installed yet,
# uncomment the next two lines:
# options(download.file.method = "wget")
# install.packages("psych")
library(psych)

# Columns that must stay out of the PCA: the row index, the class label
# and the article id.
drops <- c("X__1", "label", "id")
x <- my_data[, !is.element(names(my_data), drops)]

# Keep id and label as one-column tables so they can be re-attached later.
ids <- my_data["id"]
y <- my_data["label"]

# To choose the number of components, the eigenvalues can be inspected:
# print(eigen(cor(x)))

# Show the size of the feature table going into the PCA.
dim(x)

# Extract 108 components and apply a varimax rotation.
pca <- psych::principal(x, nfactors = 108, rotate = "varimax")

# To display the loadings in the output:
# print(pca$loadings, cutoff = 0.4, sort = TRUE)

# Component scores as a plain matrix.
vars <- as.matrix(pca$scores)

# Save the loading values to a csv file.
write.csv(pca$loadings, "data/cohmetrix/loadings.csv")

## Logistic Regression

In [3]:
# check class bias: tabulate how many rows carry each value of the label
# column held in y
table(y)

y
  0   1 
235 186 

In [4]:
# logistic regression using the PCA scores for predicting fake/satire label
# label 0: fake
# label 1: satire
#
# unlist(y) flattens the one-column label table into a plain vector for the
# response. The score matrix pca$scores is used directly on the right-hand
# side, so every component becomes one predictor and the coefficients are
# reported as "pca$scoresRC<k>".
# NOTE(review): on this data glm() warns that fitted probabilities
# numerically 0 or 1 occurred. This often points to (quasi-)separation, in
# which case estimates and standard errors may be unreliable -- confirm
# before interpreting individual coefficients.

logitMod <- glm(unlist(y) ~ pca$scores, family = binomial(link = "logit"))
summary(logitMod)

# if we want the coefficients in a csv file.
# library(broom)
# write.csv(tidy(logitMod) , "data/coefs.csv")

“glm.fit: fitted probabilities numerically 0 or 1 occurred”


Call:
glm(formula = unlist(y) ~ pca$scores, family = binomial(link = "logit"))

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.9422  -0.3662  -0.0168   0.3091   3.2540  

Coefficients:
                  Estimate Std. Error z value Pr(>|z|)    
(Intercept)     -0.6082665  0.2279775  -2.668 0.007628 ** 
pca$scoresRC2    0.0477449  0.2442210   0.195 0.845003    
pca$scoresRC1    0.9348512  0.2595258   3.602 0.000316 ***
pca$scoresRC4   -0.4184547  0.3282988  -1.275 0.202445    
pca$scoresRC5    1.0646215  0.2490032   4.276 1.91e-05 ***
pca$scoresRC8   -0.1590785  0.2001581  -0.795 0.426751    
pca$scoresRC12  -0.9787187  0.4663562  -2.099 0.035848 *  
pca$scoresRC7   -0.2443520  0.2259201  -1.082 0.279436    
pca$scoresRC16  -0.2185818  0.2550803  -0.857 0.391493    
pca$scoresRC63   0.0476309  0.2265603   0.210 0.833484    
pca$scoresRC9    1.0402957  0.3683254   2.824 0.004737 ** 
pca$scoresRC36  -0.0233131  0.2317005  -0.101 0.919854    
pca$scoresRC19  -0.56820

### Stepwise backward regression

In [5]:
# stepwise backward elimination on a linear model fitted with aov():
# the PCA scores are the predictors and the label (y) is the response only;
# it is not among the predictors. step() starts from the model with all
# components and drops terms one at a time, printing the AIC at each step.
aov_model <- step(aov(as.matrix(y) ~ ., data=as.data.frame(pca$scores)), direction="backward")
summary(aov_model)

Start:  AIC=-688.63
as.matrix(y) ~ RC2 + RC1 + RC4 + RC5 + RC8 + RC12 + RC7 + RC16 + 
    RC63 + RC9 + RC36 + RC19 + RC67 + RC13 + RC20 + RC6 + RC23 + 
    RC53 + RC14 + RC17 + RC15 + RC38 + RC30 + RC11 + RC18 + RC33 + 
    RC49 + RC22 + RC31 + RC43 + RC42 + RC3 + RC27 + RC24 + RC32 + 
    RC28 + RC29 + RC34 + RC21 + RC26 + RC25 + RC52 + RC37 + RC58 + 
    RC41 + RC48 + RC35 + RC70 + RC55 + RC40 + RC44 + RC46 + RC57 + 
    RC45 + RC72 + RC39 + RC50 + RC86 + RC47 + RC56 + RC60 + RC71 + 
    RC62 + RC74 + RC73 + RC94 + RC65 + RC10 + RC77 + RC64 + RC61 + 
    RC51 + RC84 + RC69 + RC76 + RC68 + RC66 + RC75 + RC80 + RC54 + 
    RC79 + RC78 + RC83 + RC82 + RC81 + RC88 + RC85 + RC87 + RC90 + 
    RC59 + RC92 + RC89 + RC91 + RC96 + RC93 + RC95 + RC97 + RC98 + 
    RC99 + RC100 + RC102 + RC101 + RC103 + RC104 + RC105 + RC106 + 
    RC107 + RC108

        Df Sum of Sq    RSS     AIC
- RC35   1    0.0000 48.867 -690.63
- RC50   1    0.0001 48.867 -690.63
- RC97   1    0.0006 48.868 -690.63
- RC81

             Df Sum Sq Mean Sq F value   Pr(>F)    
RC1           1   2.03   2.031  14.302 0.000181 ***
RC5           1   2.09   2.093  14.734 0.000145 ***
RC12          1   0.63   0.629   4.427 0.036051 *  
RC63          1   0.38   0.378   2.660 0.103754    
RC9           1   2.44   2.439  17.171 4.23e-05 ***
RC19          1   0.39   0.393   2.764 0.097253 .  
RC67          1   1.66   1.664  11.717 0.000688 ***
RC20          1   0.47   0.473   3.331 0.068778 .  
RC6           1   3.57   3.571  25.143 8.24e-07 ***
RC23          1   0.39   0.388   2.731 0.099237 .  
RC53          1   0.57   0.566   3.987 0.046585 *  
RC17          1   0.99   0.993   6.990 0.008541 ** 
RC15          1   0.30   0.296   2.085 0.149552    
RC33          1   7.91   7.912  55.703 5.97e-13 ***
RC22          1   2.10   2.104  14.810 0.000140 ***
RC42          1   6.35   6.355  44.740 8.22e-11 ***
RC3           1   0.54   0.540   3.804 0.051886 .  
RC24          1   0.66   0.659   4.642 0.031842 *  
RC29        

In [8]:
# Export the selected PCA components, together with id and label, as input
# for the classification experiments.
# NOTE: from here on my_data holds the PCA scores and no longer the
# spreadsheet contents loaded at the top of the notebook.
my_data <- as.data.frame(pca$scores)

# Reduced component set.
# NOTE(review): this looks like the terms kept by the stepwise model that
# are significant at p < 0.05 in its summary -- confirm against the full
# summary output.
my_cols <- c(
  "RC1", "RC5", "RC12", "RC9", "RC67", "RC6", "RC53", "RC17", "RC33",
  "RC22", "RC42", "RC24", "RC26", "RC48", "RC55", "RC40", "RC47", "RC61",
  "RC75", "RC83", "RC87", "RC104"
)

# Larger component set.
# NOTE(review): presumably every term retained by the stepwise backward
# elimination -- confirm against the full summary output.
my_cols_orig <- c(
  "RC1", "RC5", "RC12", "RC63", "RC9", "RC19", "RC67", "RC20", "RC6",
  "RC23", "RC53", "RC17", "RC15", "RC33", "RC22", "RC42", "RC3", "RC24",
  "RC29", "RC21", "RC26", "RC52", "RC37", "RC58", "RC48", "RC55", "RC40",
  "RC45", "RC86", "RC47", "RC73", "RC94", "RC65", "RC10", "RC61", "RC84",
  "RC76", "RC75", "RC83", "RC82", "RC87", "RC59", "RC89", "RC93", "RC103",
  "RC104"
)

# Keep only the listed components; columns stay in the order they have in
# the score table.
my_data1 <- my_data[, is.element(names(my_data), my_cols)]
my_data2 <- my_data[, is.element(names(my_data), my_cols_orig)]

# One file per component set, each with id first and label last.
write.csv(cbind(ids, my_data1, y), file = "data/cohmetrix/classification.csv")
write.csv(
  cbind(ids, my_data2, y),
  file = "data/cohmetrix/classification_orig.csv"
)